# **1. Data Manipulation**

In [None]:
# We saw there were some missing values in the dataset
# Checking for missing values
def check_null_values(data):
    '''
    Summarise the missing values of every column of a DataFrame.

    data: requires a DataFrame object.
    ---
    returns: A DataFrame indexed by column name with the columns
             'Null Values' (True when the column holds at least one null),
             'Count_Nulls', 'Percentage_Nulls' (share of the rows of `data`)
             and 'Dtype', sorted by 'Count_Nulls' in descending order.
    '''
    n_rows = len(data)
    count_nulls = [int(data[x].isnull().sum()) for x in data.columns]
    # The percentage is relative to the frame that was passed in, not to any
    # module-level DataFrame. An empty frame reports 0% rather than dividing
    # by zero.
    if n_rows:
        percentages = [count * 100 / n_rows for count in count_nulls]
    else:
        percentages = [0.0 for _ in count_nulls]
    null_values = pd.DataFrame({
        'Null Values': [count > 0 for count in count_nulls],
        'Count_Nulls': count_nulls,
        'Percentage_Nulls': percentages,
        # data.dtypes is indexed by column name, which gives the result its index
        'Dtype': data.dtypes,
    })
    return null_values.sort_values(by='Count_Nulls', ascending=False)

In [None]:
# Removing the % sign and converting the value to an integer
def str_to_int(val):
    '''
    Turn a percentage-style string such as '85%' into an integer.

    val: string value to be converted to int
    ----
    returns the integer value of val after every '%' character is removed
    '''
    return int(re.sub('%', '', val))

# Binning the percentage ratings into categories for EDA
def round_fix(data):
    '''
    Bin a percentage rating into a named quality category.

    data: A numeric string value (optionally with a '%' sign), or 'NA'
    ----
    returns 'NA' for a missing rating, otherwise one of 'Very_Bad' (<= 40),
    'Bad' (<= 55), 'Average' (<= 70), 'Good' (<= 85) or 'Very_Good' (<= 100);
    values above 100 match no category and yield None
    '''
    text = str(data).strip()
    if text == 'NA':
        return 'NA'

    score = str_to_int(text)
    if score <= 40:
        return 'Very_Bad'
    elif score <= 55:
        return 'Bad'
    elif score <= 70:
        return 'Average'
    elif score <= 85:
        return 'Good'
    elif score <= 100:
        return 'Very_Good'
    # Nothing matched (score above 100): there is no category for it.
    return None

In [None]:
def round_fix_imdb(data):
    '''
    Bin a 0-10 rating into a named quality category.

    data: A numeric value or numeric string, or 'NA'
    ----
    returns 'NA' for a missing rating, otherwise one of 'Very_Bad' (<= 4.0),
    'Bad' (<= 5.5), 'Average' (<= 7.0), 'Good' (<= 8.5) or 'Very_Good' (<= 10);
    values above 10 match no category and yield None
    '''
    # Stringify and strip first, the same way round_fix does, so that a padded
    # marker such as ' NA ' is treated as missing instead of failing in float().
    text = str(data).strip()
    if text == 'NA':
        return 'NA'

    rating = float(text)
    if rating <= 4.0:
        return 'Very_Bad'
    if rating <= 5.5:
        return 'Bad'
    if rating <= 7.0:
        return 'Average'
    if rating <= 8.5:
        return 'Good'
    if rating <= 10:
        return 'Very_Good'
    # Nothing matched (rating above 10, or NaN): there is no category for it.
    return None

In [None]:
# This function will return a list of categories present in the dataset
def get_categories(data):
    '''
    Collect every distinct category from comma-separated category strings.

    data: A series (or any iterable) of comma-separated category strings;
          missing entries (NaN) are ignored
    ----
    returns a sorted list of the unique categories present; categories are
    taken verbatim between commas (no whitespace stripping)
    '''
    categories = set()
    # Walk every element, including the first one, so no row is left out.
    for entry in data:
        # A missing value stringifies to 'nan'; it carries no category.
        if str(entry).strip() == 'nan':
            continue
        categories.update(entry.split(','))
    # Sorted so the result (and anything built from it) is deterministic.
    return sorted(categories)


# To keep all the categories for every column update
def keep_categories(data, columns):
    '''
    Gather the categories of several columns in one lookup table.

    data: dataframe object
    columns: column names present in the dataframe object whose categories should be collected.
    ----
    returns a dictionary mapping each column name to the list produced by
    get_categories for that column
    '''
    # Seed every requested column with None, then fill in its categories.
    kata = dict.fromkeys(columns)
    kata.update((name, get_categories(data[name])) for name in columns)
    return kata


# This function will one-hot encode the categories of a column as lists of 0/1 flags
def get_cat_order(data, col):
    '''
    One-hot encode a column of comma-separated category strings.

    data: Series (or other sized iterable) of comma-separated category strings
    col: Name of the column; used as the prefix of every generated key
    ----
    returns a dictionary mapping '<col>_<category>' to a list with one 0/1
    flag per element of data (1 when the element contains that category)
    '''
    n_rows = len(data)
    # One zero-filled flag list per category found in the data.
    cata = {
        col + '_' + category: [0] * n_rows
        for category in get_categories(data)
    }
    for row, element in enumerate(data):
        # get_categories skips missing ('nan') entries, so there is nothing to
        # flag for them; leave the whole row at 0 instead of failing on split().
        if str(element).strip() == 'nan':
            continue
        for category in element.split(','):
            cata[col + '_' + category][row] = 1
    return cata


# This function will replace the original column with new columns of categories
def update_data(data, col, cata):
    '''
    Swap a column of the dataframe for a set of new columns.

    data: Dataframe object (modified in place)
    col: column name(s) present in the dataframe to be dropped
    cata: dictionary of new column name -> values to be added to the dataframe
    ----
    returns the same dataframe object after adding the columns in cata and
    removing the column(s) passed in col.
    '''
    for new_column in cata:
        data[new_column] = cata[new_column]
    # Drop in place so the caller's dataframe is the one that changes.
    data.drop(columns=col, inplace=True)
    return data


# This will apply encoding over all the columns
# If get_kata is 1 then it will call keep_categories in itself and return two values
# Otherwise, only single dataframe will be returned
def apply_encoding(data, columns, get_kata):
    '''
    Replace each listed column with its one-hot encoded category columns.

    data: dataframe object
    columns: column names present in the dataframe
    get_kata: either to keep categories or not.
                1: keep categories
                0: don't keep the categories
    ----
    returns (categories, dataframe) when get_kata is 1, otherwise only the
    dataframe
    '''
    wants_categories = get_kata == 1
    if wants_categories:
        # Must be collected before the loop below drops the original columns.
        kata = keep_categories(data, columns)

    for name in columns:
        encoded = get_cat_order(data[name], name)
        data = update_data(data, name, encoded)

    if wants_categories:
        return kata, data
    return data

In [None]:
def get_counts(data, col, categories):
    '''
    Total up the encoded columns of each category.

    data: dataframe object
    col: name of the column (prefix of the encoded columns)
    categories: categories present
    ----
    return a dictionary mapping each category to the sum of the
    '<col>_<category>' column
    '''
    return {label: data[col + '_' + label].sum() for label in categories}

In [None]:
def get_ott_counts(data, platforms, column):
    '''
    Count the rows of every platform for each value of a column.

    data: Dataframe object
    platforms: Name of the OTT platforms (columns holding 1 for available titles)
    column: Name of the column to get counts for
    ----
    returns a dataframe with the columns 'ID' (running row number), the given
    column, 'Count' and 'Platform', one row per (platform, column value) pair
    '''
    table = {'ID': [], column: [], 'Count': [], 'Platform': []}
    next_id = 0
    for platform in platforms:
        print('>>>> Done: ' + platform)
        # Only the rows available on this platform take part in the grouping.
        available = data[data[platform] == 1]
        grouped = available.groupby(column)
        for key in grouped.groups:
            n_titles = grouped.get_group(key).shape[0]
            table['ID'].append(next_id)
            table[column].append(key)
            table['Count'].append(n_titles)
            table['Platform'].append(platform)
            next_id += 1
    return pd.DataFrame(table)

In [None]:
# Let's start processing the age column first into a categorical column
def convertAge(age):
    '''
    Map an age rating to an audience category.

    age: age rating such as '18+', '16+', '13+', or 'NA'
    ----
    returns 'NA' unchanged, 'Adult' for '18+', 'Teen' for '13+' and '16+',
    and 'Non-Adult' for every other rating
    '''
    if age == 'NA':
        return age
    if age == '18+':
        return 'Adult'
    if age in ('13+', '16+'):
        return 'Teen'
    return 'Non-Adult'

In [None]:
# This function will one-hot encode a column against a fixed list of categories; unknown categories are flagged under 'Others'
def encode_data(data, cat, col):
    '''
    One-hot encode comma-separated category strings against known categories.

    data: sized iterable of comma-separated category strings
    cat: categories to create flag lists for
    col: string value prefixed to every category name
    ----
    returns a dictionary mapping '<col>_<category>' to a list with one 0/1
    flag per element of data; a category that is not in cat is flagged under
    '<col>_Others' instead
    '''
    n_rows = len(data)
    cata = {col + '_' + category: [0] * n_rows for category in cat}
    fallback_key = col + '_' + 'Others'
    for row, element in enumerate(data):
        for category in element.split(','):
            key = col + '_' + category
            # Categories without a list of their own are pooled under 'Others'.
            if key not in cata:
                key = fallback_key
            cata[key][row] = 1
    return cata

In [None]:
# Creating a function to convert country names into Continents
def continentName(data, col, length):
    '''
    One-hot encode the continents of comma-separated country names.

    data: iterable of strings, each holding one or more comma-separated
          country names ('NA' marks a missing country)
    col: string value to be prefixed
    length: Number of rows in the dataset (length of every returned list)
    ----
    returns a dictionary mapping '<col>_<continent>' to a list of 0/1 flags,
    where a 1 means the row has a country on that continent; a missing
    country ('NA') is flagged under '<col>_NA'
    '''
    # Create a dictionary to store the values
    continents = ['Africa', 'Antarctica', 'Asia', 'Europe', 'North America', 'Oceania', 'South America', 'NA']

    # Creating a dictionary of each continent
    cata = {col + '_' + continent: [0] * length for continent in continents}

    # Some of the country names are not available in the lookup, so we keep a
    # dictionary of codes for those names.
    # 'IMN' and 'CS' are not passed to the continent lookup; they are mapped
    # to Europe directly further down.
    # NOTE(review): 'ET' looks like Ethiopia's alpha-2 code, which would place
    # the Soviet Union in Africa - confirm the intended code.
    unavailable_names = {'Soviet Union': 'ET',
                         'Republic of North Macedonia': 'MK',
                         'Federal Republic of Yugoslavia': 'RS',
                         'The Democratic Republic Of Congo': 'CD',
                         'Isle Of Man': 'IMN',
                         'Yugoslavia': 'RS',
                         'Czechoslovakia': 'CZ',
                         'Serbia and Montenegro': 'CS',
                         'Kosovo': 'IMN',
                         'U.S. Virgin Islands': 'VI',
                         'Reunion': 'RE'}

    # Iterating through each country and setting the flag to 1 if that
    # country is present in that continent
    for element_index, element in enumerate(data):
        for country_name in element.split(','):
            if country_name == 'NA':
                # A missing country gets its own 'NA' flag. Without this the
                # name below would be undefined (first item) or left over from
                # the previously processed country.
                continent_name = 'NA'
            else:
                # Remove compass words (e.g. 'West Germany' -> 'Germany').
                # Names containing 'Africa', 'Korea' or 'Macedonia' keep theirs,
                # and a bare 'Korea' is turned into 'North Korea'.
                for direction in ['East', 'West', 'South', 'North']:
                    if 'Africa' in country_name or 'Korea' in country_name or 'Macedonia' in country_name:
                        if country_name == 'Korea':
                            country_name = 'North ' + country_name
                        break
                    country_name = country_name.replace(direction, '').strip()
                if country_name in unavailable_names:
                    country_code = unavailable_names[country_name]
                else:
                    country_code = pc.country_name_to_country_alpha2(country_name, cn_name_format="default")
                # These codes are assigned to Europe without a continent lookup.
                if country_code in ['VA', 'CS', 'IMN']:
                    continent_name = 'Europe'
                else:
                    continent_code = pc.country_alpha2_to_continent_code(country_code)
                    continent_name = pc.convert_continent_code_to_continent_name(continent_code)
            cata[col + '_' + continent_name][element_index] = 1
    return cata

In [None]:
def yearConvert(year):
    '''
    Map a release year to the name of its era.

    year: numeric year
    ----
    returns 'Old' (<= 1940), 'Vintage' (<= 1970), 'Golden' (<= 1990),
    'Modern' (<= 2010) or 'Latest' (anything later)
    '''
    # (last year of the era, era name), in ascending order
    eras = ((1940, 'Old'), (1970, 'Vintage'), (1990, 'Golden'), (2010, 'Modern'))
    for last_year, era in eras:
        if year <= last_year:
            return era
    return 'Latest'

# **2. Data Visualization**

In [None]:
# Function to plot value counts plots
def plot_value_counts_bar(data, col):
    '''
    Plot how often each value of a column occurs as a bar chart.

    data: Dataframe.
    col : Name of the column to be plotted.
    ----
    returns a plotly figure with one bar per distinct value of col
    '''
    # Name the counts column explicitly: pandas >= 2.0 calls the result of
    # value_counts() 'count' rather than reusing the original column name, so
    # relying on the implicit name breaks the y=col lookup below.
    counts = data[col].value_counts().rename_axis(None)
    value_counts = counts.to_frame(name=col)
    value_counts['cat'] = value_counts.index
    return px.bar(value_counts, x='cat', y=col, color='cat', title=col)

In [None]:
def plot_category_counts_bar(Category, data=None):
    '''
    Plot the totals of the encoded columns of one category group as a bar chart.

    Category: prefix of the encoded columns (named '<Category>_<value>')
    data: Dataframe holding the encoded columns; when omitted, the
          module-level dataframe `df` is used
    ----
    returns a plotly figure with one bar per encoded value, sorted by count
    in descending order
    '''
    if data is None:
        data = df

    # Get base counts for the category for plotting
    base_counts = data.loc[:, data.columns.str.startswith(Category)].sum()

    # Convert sum to dataframe
    base_counts = base_counts.to_frame(name='Counts').reset_index()

    # Split the column name at the first "_" only, so that a value which itself
    # contains "_" still yields exactly two parts (prefix, value).
    base_counts[['index', Category]] = base_counts['index'].str.split('_', n=1, expand=True)

    # Sort values
    base_counts = base_counts.sort_values(by='Counts', ascending=False)

    # Visualize data
    return px.bar(base_counts, x=Category, y='Counts', color=Category)

In [None]:
def replaceNAby1(rating):
    '''
    Substitute the missing-rating marker with '-1'.

    rating: rating value, or 'NA'
    ----
    returns the string '-1' when rating is 'NA', otherwise rating unchanged
    '''
    return '-1' if rating == 'NA' else rating

# **3. Recommender System**

In [None]:
def create_soup(data):
    '''
    Build the text "soup" of one record for the count vectorizer.

    data: a row (e.g. a pandas Series) with a 'Title' entry that supports
          positional slicing
    ----
    returns the lower-cased title followed by every remaining field of the
    row, converted to text and separated by single spaces
    '''
    # Creating a simple text for countvectorizer to work with
    parts = [data['Title'].lower()]
    # data[1:] skips the first field - presumably the title itself; verify
    # against the caller. Every remaining field is appended, not just the
    # first one.
    parts.extend(str(value) for value in data[1:])
    return ' '.join(parts)