In [59]:
import math
import pandas as pd
import numpy as np
import random

import warnings
warnings.filterwarnings('ignore')

In [60]:
# processed file load from pickle
df_processed = pd.read_pickle('./../data/df_processed.pkl')

In [61]:
def idf_score(column):
    """
    returns IDF of numpy array
    IDF = log((Total number of documents in corpus) / (Number of documents containing the term))
    """
    N = len(column)
    number_non_zero = column.astype(bool).sum(axis=0)
    return np.log(N / number_non_zero)

In [62]:
# get all float columns
float_columns = df_processed.select_dtypes(include=['float64']).columns
print(float_columns)
# normalize all columns except of float columns using idf method
columns_to_normalize = list(set(df_processed.columns) - set(float_columns))
print(f'about to normalize {len(columns_to_normalize)} columns')
for column in columns_to_normalize:
    # convert column to float
    df_processed[column] = df_processed[column].astype(float)
    # convert column to numpy array
    column_array = df_processed[column].to_numpy()

    idf = idf_score(column_array)
    if idf == math.inf:
        # drop column if idf is inf
        df_processed.drop(column, axis=1, inplace=True)
        continue
    if idf < 1 or random.randint(0, 100) == 0:
        print(f'column {column} has idf {idf}')
    # assign value idf to all non zero values
    column_array[column_array != 0] = idf
    # assign normalized values to column and sparse
    df_processed[column] = pd.SparseArray(column_array, dtype=float)

    


Index(['Released_Year', 'Runtime', 'IMDB_Rating', 'Meta_score', 'No_of_Votes',
       'Gross'],
      dtype='object')
about to normalize 3915 columns
column  has idf 0.0
column hotel has idf 5.298317366548036
column tennis has idf 5.809142990314028
column humanoid has idf 6.214608098422191
column texas has idf 5.298317366548036
column danish has idf 6.214608098422191
column survive has idf 4.605170185988092
column d has idf 0.015113637810048106
column Sohum Shah has idf 6.214608098422191
column m has idf 0.038740828316430616
column mumbai has idf 5.809142990314028
column a has idf 0.3538218749563259
column does has idf 4.8283137373023015
column and has idf 0.5025268209512956
column n has idf 0.4716049106127094
column le has idf 0.5310283310835101
column v has idf 0.31608154697347896
column Director_Neeraj Pandey has idf 5.521460917862246
column on has idf 0.38860799104174143
column o has idf 0.49593701127224005
column del has idf 5.298317366548036
column mars has idf 5.115995809754082


In [67]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 3898 entries, Released_Year to Bruno Kirby
dtypes: Sparse[float64, nan](3892), float64(6)
memory usage: 44.6 MB


In [None]:
df_processed.to_pickle('./data/df_processed_idf.pkl')