# Preprocessing and Feature Engineering

## Note: For visualizations, please refer to visualization.ipynb.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimage.io import imread, imshow
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_regression
%matplotlib inline

In [3]:
path_drive = 'data/ift6758-a20/'
df_train = pd.read_csv(path_drive + 'train.csv')
df_test = pd.read_csv(path_drive + 'test.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
plt.style.use("seaborn")

### Replace NaN with "NaN"

In [4]:
def replace_to_string_NaN(col):
    return col.replace(np.nan,'NaN')

### Label Encode

In [5]:
def labelEncode(col):
    
    #transform missing values to string 'Nan'
    clean_col = replace_to_string_NaN(col)
    
    # Label encode the given values
    le = LabelEncoder()
    new_col = le.fit_transform(clean_col)
    return pd.DataFrame(new_col)

### Combine if less than

In [6]:
def combine_if_less_than(df, col_name, value):
    df.loc[df.groupby(col_name)[col_name].transform('count').lt(value), col_name] = 'other'

### Replace Nan by Median

In [7]:
def replace_by_median(value, col):
    if np.isnan(value):
        return col.median()
    return value

In [8]:
def replace_zero_by_median(value, col):
    if value == 0:
        return col.median()
    return value

### Convert hex to RGB

In [9]:
def hexToRGB(h, name):
    try:
        rgb = [int(h[i:i+2], 16) for i in (0, 2, 4)]
        return pd.Series({f'{name}_R':rgb[0], f'{name}_G':rgb[1], f'{name}_B':rgb[2]})
    except:
        # if NaN return BLACK -> Can be modified if better result => (black or remove)
        return pd.Series({f'{name}_R': 0, f'{name}_G': 0 , f'{name}_B': 0 })

### Remove Outliers 

In [10]:
def remove_outlier(df):
    df = df[df.between(df.quantile(.15), df.quantile(.85))] # without outliers

### User Time Zone To Continent

In [11]:
def convert_to_continent(country):
    NA = ['Greenland','Central America','Mountain Time (US & Canada)','Eastern Time (US & Canada)','Pacific Time (US & Canada)','America/New_York','Central Time (US & Canada)','Atlantic Time (Canada)','Arizona','Alaska','America/Boise','America/Chicago','America/Los_Angeles','America/Denver'] 
    SA = ['Lima','Bogota','Buenos Aires','America/Sao_Paulo','America/Panama', 'America/Santiago','America/Sao_Paulo ','Brasilia','Mexico City','Caracas','Hawaii','Santiago','Quito','Caracas','America/Argentina/Buenos_Aires','America/Bogota','America/Hermosillo','America/Mexico_City']
    Eur = ['Belgrade','Prague','Kyiv','Copenhagen','Lisbon','Irkutsk', 'Stockholm','Ljubljana','Dublin','Paris','Bern','Moscow','Brussels','Vienna','London','Athens','Amsterdam','Berlin','Madrid','Rome','Istanbul','Zagreb','Vilnius','Volgograd','Warsaw','Yerevan']
    Asia = ['Islamabad ', 'Karachi', 'Jerusalem','Hanoi','Bangkok', 'Beijing','Kuala Lumpur','Kuwait','Chennai','Muscat','Seoul','Riyadh','Singapore','Mumbai','New Delhi','Sydney','Hong Kong','Baghdad','Asia/Colombo','Asia/Calcutta','Urumqi','Tehran','Tokyo','Jakarta','Abu Dhabi','Adelaide','Almaty','Wellington']       #asia and australia
    Afr = ['Nairobi', 'Casablanca','Cairo','Pretoria','Africa/Johannesburg','West Central Africa']
    Australia=['Brisbane','Melbourne']
    
    if (country in NA):
        return 'North America'
    elif (country in SA):
        return 'South America'
    elif (country in Eur):
        return 'Europe'
    elif (country in Afr):
        return 'Africa'
    elif (country in Asia):
        return 'Asia'
    elif (country in Australia):
        return 'Australia'
    else :
        return 'other' 

In [12]:
df_train.head()

Unnamed: 0,Id,User Name,Personal URL,Profile Cover Image Status,Profile Verification Status,Profile Text Color,Profile Page Color,Profile Theme Color,Is Profile View Size Customized?,UTC Offset,Location,Location Public Visibility,User Language,Profile Creation Timestamp,User Time Zone,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Profile Category,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Profile Image,Num of Profile Likes
0,AL85S14OMDPF01I9,Mf9vfld4Vfe,,Set,Verified,db1a2c,eaf0f2,e70409,False,39600.0,,Enabled,en,Thu Nov 27 05:24:59 +0000 2008,Sydney,95763,4289,30809,873,business,14.792,1.5761,AL85S14OMDPF01I9.png,2815
1,HI11QOPD7BLJTO7Q,xl9gaGN0hxM_,,Set,Verified,0099cc,f6ffd1,fff04d,False,,mumbai,Enabled,en,Fri Jan 15 18:00:46 +0000 2010,,1018746,289,8150,290,unknown,8.183,11.2782,HI11QOPD7BLJTO7Q.png,1242
2,JS49LP5P72RI1OQB,d_uiMm,,Set,Not verified,1fc2de,efefef,1fc2de,False,-18000.0,NYC + 70 Countries Worldwide,Enabled,en,Fri Oct 02 20:15:06 +0000 2009,Central Time (US & Canada),13444,1876,4698,227,unknown,31.823,0.5725,JS49LP5P72RI1OQB.png,1559
3,S0GDSC09MACCLBJP,hfylaRr,https://blob/e/g9pex_vS.com,Not set,Verified,050000,616161,00090a,False,-14400.0,"Indianapolis, In",Enabled,en,Thu Feb 19 14:37:22 +0000 2009,Eastern Time (US & Canada),339168,1148,53216,4035,business,23.052,4.0265,S0GDSC09MACCLBJP.png,6342
4,CRSEMK4QER6LDJSA,hRR1sDGlz5,https://blob/v/Szeo.h4/.com,Set,Not verified,58424d,f7f7f7,000000,False,-18000.0,"777 Beach Blvd. Biloxi, MS",Enabled,en,Tue Mar 31 13:27:52 +0000 2009,Central Time (US & Canada),9215,93,3271,130,unknown,8.418,3.9229,CRSEMK4QER6LDJSA.png,1078


## Pre-Processing

In [13]:
def preprocessing(df):
    
    if 'Num of Profile Likes' in df.columns:
        df['num_profile_likes_log'] = np.log(df['Num of Profile Likes'] + 1)
    
    # Length of User Name
    df['user_name_length'] = df['User Name'].str.len()
    df = df.drop(["User Name"], axis=1)
    
     # Binary URL 
    df['personal_url_binary'] = np.where(df['Personal URL'].notnull(), 1, 0)
    df = df.drop(["Personal URL"], axis=1)
    
    # Binary Profile Cover Image Status
    df['cover_image_binary'] = np.where(df['Profile Cover Image Status'] == 'Set', 1, 0)
    df = df.drop(["Profile Cover Image Status"], axis=1)
    
    # One-Hot encode Profile Verification Status
    df = df.merge(pd.get_dummies(df['Profile Verification Status'].str.lower(), prefix='verification_status'),
                             left_index=True, right_index=True)
    df = df.drop(["Profile Verification Status"], axis=1)
    
    # Set hex to RGB
    df[["Profile_Text_Color_R","Profile_Text_Color_G", "Profile_Text_Color_B"]] = df["Profile Text Color"].apply(lambda x: hexToRGB(x, "Profile_Text_Color"))
    df[["Profile_Page_Color_R","Profile_Page_Color_G", "Profile_Page_Color_B"]] = df["Profile Page Color"].apply(lambda x: hexToRGB(x, "Profile_Page_Color"))
    df[["Profile_Theme_Color_R","Profile_Theme_Color_G", "Profile_Theme_Color_B"]] = df["Profile Theme Color"].apply(lambda x: hexToRGB(x, "Profile_Theme_Color"))
    
    df = df.drop(["Profile Text Color", "Profile Page Color", "Profile Theme Color"], axis=1)
    
    # Profile view size customized? 
    # Already good because True or False
    
    # One-Hot encode UTC Offset
    df['utc_offset'] = np.where(df['UTC Offset'].isnull(), 'unk', df['UTC Offset'])
    df.loc[df.groupby('utc_offset')['utc_offset'].transform('count').lt(10), 'utc_offset'] = 'other'
    df = df.merge(pd.get_dummies(df['utc_offset'], prefix='utc_offset_'),
                  left_index=True, right_index=True)
    
    df = df.drop(["UTC Offset"], axis=1)

    # Clean up Location
    location_dict = {'Barcelona':'Barcelona, Spain', 'Brasil':'Brazil', 'Chicago':'Chicago, IL', 'Ciudad de México':'Mexico City, Mexico', 'España':'Spain', 'Global':'Worldwide', 'London':'London, UK', 'London, England':'London, UK', 'Los Angeles':'Los Angeles, CA', 'İstanbul, Türkiye':'Istanbul, Turkey', 'İstanbul':'Istanbul, Turkey', 'Istanbul':'Istanbul, Turkey', 'Jakarta':'Jakarta, Indonesia', 'Kingdom of Saudi Arabia':'Saudi Arabia', 'Las Vegas':'Las Vegas, NV', 'Madrid':'Madrid, Spain', 'México':'Mexico', 'Mumbai':'Mumbai, India', 'New York':'New York, NY', 'New York City':'New York, NY', 'NYC':'New York, NY', 'Paris':'Paris, France', 'Rio de Janeiro':'Rio de Janeiro, Brazil', 'São Paulo':'Sao Paulo, Brazil', 'São Paulo, Brasil':'Sao Paulo, Brazil', 'Türkiye':'Turkey', 'USA':'United States', 'UK':'United Kingdom', 'Washington, D.C.':'Washington, DC', 'المملكة العربية السعودي':'Saudi Arabia'}
    df['location_clean'] = df['Location'].str.replace('[0-9]', '', regex=True)
    df['location_clean'] = df['location_clean'].str.strip().replace(location_dict)

    
    # Location
    combine_if_less_than(df, "Location", 2)
    df["location"] = labelEncode(df["Location"])
    df["location"] = df["location"].apply(lambda x: replace_by_median(x, df["location"]))
    
    df = df.drop(["Location"], axis=1)
    
    # Binary Location Public Visibility
    df['location_public_visibility_binary'] = np.where(df['Location Public Visibility'].str.lower() == 'enabled', 1,
                                                 np.where(df['Location Public Visibility'].str.lower() == 'disabled', 0, 
                                                          2)) # 2 is '??'
    df = df.drop(["Location Public Visibility"], axis=1)
    
    
    # Label Encode encode User Language
    df['user_language'] = df['User Language'].str.replace('uk', 'en-gb')
    combine_if_less_than(df, "user_language", 10)
    df = df.merge(pd.get_dummies(df['user_language'], prefix='user_language_'),
                  left_index=True, right_index=True)
    
    df = df.drop(["User Language"], axis=1)
    
    
    # Format Profile Creation Timestamp
    df['profile_creation_timestamp'] = pd.to_datetime(df['Profile Creation Timestamp'], 
                                                            format='%a %b %d %H:%M:%S %z %Y')
    df['profile_creation_year'] = df['profile_creation_timestamp'].dt.year
    df['profile_creation_month'] = df['profile_creation_timestamp'].dt.month


    df = df.drop(["Profile Creation Timestamp", "profile_creation_timestamp"], axis=1)

    
    # User Time Zone
    df["user_timezone"] = df["User Time Zone"].apply(lambda x: convert_to_continent(x))
    df = df.merge(pd.get_dummies(df['user_timezone'].str.lower(), prefix='user_timezone'),
                             left_index=True, right_index=True)
    
    df = df.drop(["User Time Zone"], axis=1)

    
    # Set missing to median for Avg Daily Profile Visit Duration in seconds
    avg_daily_visit_duration_median = np.nanmedian(df['Avg Daily Profile Visit Duration in seconds'])
    df.loc[df['Avg Daily Profile Visit Duration in seconds'].isnull(), 'Avg Daily Profile Visit Duration in seconds'] = avg_daily_visit_duration_median
    
    # Set missing to median for Avg Daily Profile Visit Duration in seconds
    avg_daily_profile_clicks_median = np.nanmedian(df['Avg Daily Profile Clicks'])
    df.loc[df['Avg Daily Profile Clicks'].isnull(), 'Avg Daily Profile Clicks'] = avg_daily_profile_clicks_median

    # One-Hot encode Profile Category
    df['Profile Category'] = np.where(df['Profile Category'] == ' ', 'unknown', df['Profile Category'])
    df = df.merge(pd.get_dummies(df['Profile Category'].str.lower(), prefix='profile_category'),
                             left_index=True, right_index=True)
    df = df.drop(["Profile Category"], axis=1)
    
    
    # Log trasform integer variables
    df["Num of Followers"] = df["Num of Followers"].apply(lambda x: np.log(x))
    df["Num of People Following"] = df["Num of People Following"].apply(lambda x: np.log(x + 1))
    df["Num of Status Updates"] = df["Num of Status Updates"].apply(lambda x: np.log(x + 1))
    df["Num of Direct Messages"] = df["Num of Direct Messages"].apply(lambda x: np.log(x + 1))
    
    # Normalize features
    scaler = MinMaxScaler()
    df[["Num of Followers"]] = scaler.fit_transform(df[["Num of Followers"]])
    df[["Num of People Following"]] = scaler.fit_transform(df[["Num of People Following"]])
    df[["Num of Status Updates"]] = scaler.fit_transform(df[["Num of Status Updates"]])
    df[["Num of Direct Messages"]] = scaler.fit_transform(df[["Num of Direct Messages"]])
    
    # drop what we do not need
    df = df.drop(["Id", "Profile Image"], axis=1)
    
    return (df)
    
    

In [14]:
df_train_clean = preprocessing(df_train)
df_test_clean = preprocessing(df_test)

In [15]:
def aggregateCategories(df_train_clean, df_test_clean):
    """ Function creates bins based on the log number of profile likes. In essence, it groups categories 
    together based on the median number of likes. For example, any language below a median of 4 would be 
    category 1, any language below a median of 5 would be category 2, and so on.
    
    Args:
        df_train_clean {DataFrame}: Training dataset
        df_test_clean {DataFrame}: Testing dataset
    Return:
        df_train_clean {DataFrame}: Training dataset with bins variables
        df_test_clean {DataFrame}: Testing dataset with bins variables
        
    """
    
    # Username Length
    user_name_length_ref = pd.DataFrame(df_train_clean.groupby('user_name_length')['num_profile_likes_log'].agg('median')).reset_index()
    user_name_length_ref['user_name_length_bins'] = np.where(user_name_length_ref['num_profile_likes_log'] < 7.5, 1, 2)
    user_name_length_ref.drop('num_profile_likes_log', axis=1, inplace=True)
    
    # UTC Offset Bins
    utc_offset_ref = pd.DataFrame(df_train_clean.groupby('utc_offset')['num_profile_likes_log'].agg('median')).reset_index()
    utc_offset_ref['utc_offset_bins'] = np.where(utc_offset_ref['num_profile_likes_log'] < 5, 1,
                                         np.where(utc_offset_ref['num_profile_likes_log'] < 6, 2,
                                                  np.where(utc_offset_ref['num_profile_likes_log'] < 7, 3, 4)))
    utc_offset_ref.drop('num_profile_likes_log', axis=1, inplace=True)
    
    # User Language Bins
    user_language_ref = pd.DataFrame(df_train_clean.groupby('user_language')['num_profile_likes_log'].agg('median')).reset_index()
    user_language_ref['user_language_bins'] = np.where(user_language_ref['num_profile_likes_log'] < 4, 1,
                                                       np.where(user_language_ref['num_profile_likes_log'] < 5, 2,
                                                                np.where(user_language_ref['num_profile_likes_log'] < 6, 3,
                                                                         np.where(user_language_ref['num_profile_likes_log'] < 7, 4, 5))))
    user_language_ref.drop('num_profile_likes_log', axis=1, inplace=True)
    
    # User Timezone Bins
    user_timezone_ref = pd.DataFrame(df_train_clean.groupby('user_timezone')['num_profile_likes_log'].agg('median')).reset_index()
    user_timezone_ref['user_timezone_bins'] = np.where(user_timezone_ref['num_profile_likes_log'] < 4, 1,
                                                       np.where(user_timezone_ref['num_profile_likes_log'] < 5, 2,
                                                                np.where(user_timezone_ref['num_profile_likes_log'] < 6, 3,
                                                                         np.where(user_timezone_ref['num_profile_likes_log'] < 7, 4,
                                                                                  np.where(user_timezone_ref['num_profile_likes_log'] < 8, 5, 6)))))
    user_timezone_ref.drop('num_profile_likes_log', axis=1, inplace=True)

    df_train_clean = df_train_clean.merge(user_name_length_ref, on='user_name_length', how='left')
    df_train_clean = df_train_clean.merge(utc_offset_ref, on='utc_offset', how='left')
    df_train_clean = df_train_clean.merge(user_language_ref, on='user_language', how='left')
    df_train_clean = df_train_clean.merge(user_timezone_ref, on='user_timezone', how='left')
    
    
    df_test_clean = df_test_clean.merge(user_name_length_ref, on='user_name_length', how='left')
    df_test_clean = df_test_clean.merge(utc_offset_ref, on='utc_offset', how='left')
    df_test_clean = df_test_clean.merge(user_language_ref, on='user_language', how='left')
    df_test_clean = df_test_clean.merge(user_timezone_ref, on='user_timezone', how='left')
    
    
    
    df_train_clean = df_train_clean.drop(["utc_offset", "location_clean", "user_timezone", "user_language"], axis=1)
    df_test_clean = df_test_clean.drop(["utc_offset", "location_clean", "user_timezone", "user_language"], axis=1)
    
    return df_train_clean, df_test_clean

In [16]:
df_train_clean, df_test_clean = aggregateCategories(df_train_clean, df_test_clean)
df_train_clean.head()

Unnamed: 0,Is Profile View Size Customized?,Num of Followers,Num of People Following,Num of Status Updates,Num of Direct Messages,Avg Daily Profile Visit Duration in seconds,Avg Daily Profile Clicks,Num of Profile Likes,num_profile_likes_log,user_name_length,personal_url_binary,cover_image_binary,verification_status_not verified,verification_status_pending,verification_status_verified,Profile_Text_Color_R,Profile_Text_Color_G,Profile_Text_Color_B,Profile_Page_Color_R,Profile_Page_Color_G,Profile_Page_Color_B,Profile_Theme_Color_R,Profile_Theme_Color_G,Profile_Theme_Color_B,utc_offset__-10800.0,utc_offset__-14400.0,utc_offset__-18000.0,utc_offset__-21600.0,utc_offset__-25200.0,utc_offset__-28800.0,utc_offset__-36000.0,utc_offset__-39600.0,utc_offset__-7200.0,utc_offset__10800.0,utc_offset__14400.0,utc_offset__18000.0,utc_offset__19800.0,utc_offset__25200.0,utc_offset__28800.0,utc_offset__32400.0,utc_offset__3600.0,utc_offset__36000.0,utc_offset__39600.0,utc_offset__46800.0,utc_offset__7200.0,utc_offset__other,utc_offset__unk,location,location_public_visibility_binary,user_language__ar,user_language__de,user_language__en,user_language__en-gb,user_language__es,user_language__fr,user_language__id,user_language__it,user_language__ja,user_language__ko,user_language__nl,user_language__other,user_language__pl,user_language__pt,user_language__ru,user_language__tr,profile_creation_year,profile_creation_month,user_timezone_africa,user_timezone_asia,user_timezone_australia,user_timezone_europe,user_timezone_north america,user_timezone_other,user_timezone_south america,profile_category_business,profile_category_celebrity,profile_category_government,profile_category_unknown,user_name_length_bins,utc_offset_bins,user_language_bins,user_timezone_bins
0,False,0.583535,0.60531,0.647729,0.451868,14.792,1.5761,2815,7.943073,11,0,1,0,0,1,219,26,44,234,240,242,231,4,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,340,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2008,11,0,1,0,0,0,0,0,1,0,0,0,1,4,5,4
1,False,0.723466,0.410332,0.558407,0.378497,8.183,11.2782,1242,7.125283,12,0,1,0,0,1,0,153,204,246,255,209,255,240,77,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,543,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2010,1,0,0,0,0,0,1,0,0,0,0,1,1,3,5,4
2,False,0.467343,0.545488,0.521408,0.36222,31.823,0.5725,1559,7.352441,6,0,1,1,0,0,31,194,222,239,239,239,31,194,222,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,545,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2009,10,0,0,0,0,1,0,0,0,0,0,1,1,4,5,5
3,False,0.658377,0.509969,0.684443,0.553938,23.052,4.0265,6342,8.755107,7,1,0,0,0,1,5,0,0,97,97,97,0,9,10,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,545,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2009,2,0,0,0,0,1,0,0,1,0,0,0,1,4,5,5
4,False,0.44499,0.3288,0.497094,0.32525,8.418,3.9229,1078,6.98379,10,1,1,1,0,0,88,66,77,247,247,247,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,545,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2009,3,0,0,0,0,1,0,0,0,0,0,1,1,4,5,5


In [18]:
# There might be features (after one-hot) that are in test but not in train, or vice-versa
common_features = list(set(df_train_clean) & set(df_test_clean))

## Feature Selection

In [19]:
# Select the features to consider in the feature selection, and define target variable
X_train = df_train_clean[common_features]
X_test = df_test_clean[common_features]
y_train = df_train_clean['num_profile_likes_log']
X_train.head()

Unnamed: 0,utc_offset__10800.0,Profile_Page_Color_B,utc_offset__-10800.0,user_timezone_africa,user_language__en-gb,profile_category_unknown,utc_offset__-18000.0,user_language__ja,utc_offset_bins,Profile_Theme_Color_G,user_language__ar,utc_offset__28800.0,profile_category_government,Is Profile View Size Customized?,user_timezone_australia,user_name_length_bins,location_public_visibility_binary,user_language__ko,user_language__nl,profile_creation_month,utc_offset__other,utc_offset__unk,profile_creation_year,user_language__en,user_language__de,Profile_Theme_Color_B,Profile_Text_Color_R,profile_category_business,Profile_Text_Color_G,Num of Direct Messages,utc_offset__-36000.0,user_language__other,user_language__tr,user_timezone_asia,user_language_bins,user_timezone_north america,Avg Daily Profile Clicks,utc_offset__25200.0,Num of Status Updates,user_language__es,utc_offset__19800.0,user_language__it,utc_offset__-28800.0,verification_status_verified,Profile_Text_Color_B,utc_offset__-25200.0,user_name_length,utc_offset__14400.0,Avg Daily Profile Visit Duration in seconds,Num of People Following,utc_offset__32400.0,profile_category_celebrity,user_timezone_bins,utc_offset__3600.0,personal_url_binary,utc_offset__7200.0,Profile_Page_Color_R,Num of Followers,utc_offset__39600.0,utc_offset__-7200.0,user_language__pt,verification_status_not verified,Profile_Theme_Color_R,Profile_Page_Color_G,utc_offset__-14400.0,cover_image_binary,user_language__fr,user_timezone_europe,location,user_timezone_south america,user_language__ru,verification_status_pending,utc_offset__-21600.0,user_timezone_other
0,0,242,0,0,0,0,0,0,4,4,0,0,0,False,0,1,1,0,0,11,0,0,2008,1,0,9,219,1,26,0.451868,0,0,0,1,5,0,1.5761,0,0.647729,0,0,0,0,1,44,0,11,0,14.792,0.60531,0,0,4,0,0,0,234,0.583535,1,0,0,0,231,240,0,1,0,0,340,0,0,0,0,0
1,0,209,0,0,0,1,0,0,3,240,0,0,0,False,0,1,1,0,0,1,0,1,2010,1,0,77,0,0,153,0.378497,0,0,0,0,5,0,11.2782,0,0.558407,0,0,0,0,1,204,0,12,0,8.183,0.410332,0,0,4,0,0,0,246,0.723466,0,0,0,0,255,255,0,1,0,0,543,0,0,0,0,1
2,0,239,0,0,0,1,1,0,4,194,0,0,0,False,0,1,1,0,0,10,0,0,2009,1,0,222,31,0,194,0.36222,0,0,0,0,5,1,0.5725,0,0.521408,0,0,0,0,0,222,0,6,0,31.823,0.545488,0,0,5,0,0,0,239,0.467343,0,0,0,1,31,239,0,1,0,0,545,0,0,0,0,0
3,0,97,0,0,0,0,0,0,4,9,0,0,0,False,0,1,1,0,0,2,0,0,2009,1,0,10,5,1,0,0.553938,0,0,0,0,5,1,4.0265,0,0.684443,0,0,0,0,1,0,0,7,0,23.052,0.509969,0,0,5,0,1,0,97,0.658377,0,0,0,0,0,97,1,0,0,0,545,0,0,0,0,0
4,0,247,0,0,0,1,1,0,4,0,0,0,0,False,0,1,1,0,0,3,0,0,2009,1,0,0,88,0,66,0.32525,0,0,0,0,5,1,3.9229,0,0.497094,0,0,0,0,0,77,0,10,0,8.418,0.3288,0,0,5,0,1,0,247,0.44499,0,0,0,1,0,247,0,1,0,0,545,0,0,0,0,0


In [20]:
def featureSelection(X_train, X_test, y_train, scoring, bestK):
    """ Based on a correlation statistic, finds a subset of features to include in the datasets.
    
    Args:
        X_train (DataFrame): Training dataset with all features.
        X_test (DataFrame): Test dataset with all features.
        y_train (Series): Training target values.
        scoring (function): Correlation statistic tests. (ref: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection)
        bestK (int): The k best features with highest scores to include in model ('all' to keep every features).
    Return:
        X_train_fs (DataFrame): Training dataset with subset of best features.
        X_test_fs (DataFrame): Test dataset with subset of best features.
        scores (DataFrame): All the features with corresponding scores.
    """
    
    # Feature extraction based on correlations with target
    fs = SelectKBest(score_func=scoring, k=bestK)
    fs.fit(X_train, y_train)
    
    # Keep only the best K features
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)

    # Create dataframe of features and scores
    scores = pd.DataFrame()
    for i in range(len(fs.scores_)):
        scores = scores.append(
            pd.DataFrame({'feature':[X_train.columns[i]], 'score':[fs.scores_[i]]}),
            ignore_index=True)
    scores = scores.sort_values('score', ascending=False).reset_index(drop=True)
    
    # Return train and test dataframes with subset of features, and all the scores
    return X_train_fs, X_test_fs, scores

In [21]:
# Select subset of features to include in the models
bestK = 20
X_train_fs, X_test_fs, scores = featureSelection(X_train, X_test, y_train, f_regression, bestK)

In [22]:
# All the feature swith their scores
scores.head(bestK)

Unnamed: 0,feature,score
0,personal_url_binary,2764.659495
1,Num of People Following,1585.65425
2,Num of Status Updates,1400.653505
3,utc_offset_bins,607.574172
4,location_public_visibility_binary,566.390249
5,profile_category_unknown,522.044102
6,user_language_bins,451.375416
7,verification_status_verified,321.061044
8,user_timezone_bins,290.869626
9,user_timezone_north america,220.566771


### Data Export

In [24]:
df_train_clean.to_csv(path_drive + 'train_clean.csv', index=False)
df_test_clean.to_csv(path_drive + 'test_clean.csv', index=False)

## Pipeline Preprocessing + Feature Engineering + Feature Selection

In [26]:
df_train = pd.read_csv(path_drive + 'train.csv')
df_test = pd.read_csv(path_drive + 'test.csv')

# preprocessing
df_train_clean = preprocessing(df_train)
df_test_clean = preprocessing(df_test)

# aggregate categories
df_train_clean, df_test_clean = aggregateCategories(df_train_clean, df_test_clean)

# common features
common_features = list(set(df_train_clean) & set(df_test_clean))

# Select the features to consider in the feature selection, and define target variable
X_train = df_train_clean[common_features]
X_test = df_test_clean[common_features]
y_train = df_train_clean['num_profile_likes_log']

# Select subset of features to include in the models
bestK = 20
X_train_fs, X_test_fs, scores = featureSelection(X_train, X_test, y_train, f_regression, bestK)

best_features = scores.feature[:19]
df_train_clean = df_train_clean[best_features]
df_test_clean = df_test_clean[best_features]
df_train_clean["num_of_profile_likes"] = df_train['Num of Profile Likes']


df_train_clean.to_csv(path_drive + 'train_clean.csv', index=False)
df_test_clean.to_csv(path_drive + 'test_clean.csv', index=False)

# Modeling

In [40]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.svm import SVR

In [41]:
import itertools

In [42]:
path_drive = 'data/ift6758-a20/'
df_train = pd.read_csv(path_drive + 'train_clean.csv')
df_test = pd.read_csv(path_drive + 'test_clean.csv')

In [43]:
pd.set_option('display.max_columns', None)
np.set_printoptions(suppress=True) # don't use scientific notation

In [44]:
df_train.head()

Unnamed: 0,personal_url_binary,Num of People Following,Num of Status Updates,utc_offset_bins,location_public_visibility_binary,profile_category_unknown,user_language_bins,verification_status_verified,user_timezone_bins,user_timezone_north america,verification_status_not verified,profile_creation_year,Num of Direct Messages,user_timezone_asia,Is Profile View Size Customized?,Avg Daily Profile Clicks,utc_offset__32400.0,user_language__ja,profile_category_business,num_of_profile_likes
0,0,0.60531,0.647729,4,1,0,5,1,4,0,0,2008,0.451868,1,False,1.5761,0,0,1,2815
1,0,0.410332,0.558407,3,1,1,5,1,4,0,0,2010,0.378497,0,False,11.2782,0,0,0,1242
2,0,0.545488,0.521408,4,1,1,5,0,5,1,1,2009,0.36222,0,False,0.5725,0,0,0,1559
3,1,0.509969,0.684443,4,1,0,5,1,5,1,0,2009,0.553938,0,False,4.0265,0,0,1,6342
4,1,0.3288,0.497094,4,1,1,5,0,5,1,1,2009,0.32525,0,False,3.9229,0,0,0,1078


### Calculate RMSLE

In [45]:
def evaluate(predictions, y_test, metric):
    """ Function to evaluate the performance of model based on different metrics.
    
    Args:
        model (model): Classifier or regressor model to use in the prediction.
        y_test (ndarray): The training target values.
        test (string): The type of test (rmse or rmsle).
    Return:
        score (float): The average score.
    """
    
    # Calculate score
    if metric == 'rmse':
        score = np.sqrt(np.square(predictions - y_test).mean())
    elif metric == 'rmsle':
        score = np.sqrt(np.square(np.log(predictions + 1) - np.log(y_test + 1)).mean())
    else:
        score = 'Error'
    
    print(f'Score ({metric.upper()}): {score:.3f}')
    
    return score

### Custom KFoldCV

In [46]:
def KFoldCV(model, k, X, y, metric):
    """ Custom function to carry out K-Fold Cross-Validation on a training dataset manually.
    It allows for more flexibility as compared to cross_val_score().
    Args:
        model (model): Model used for prediction.
        k (int): The number of folds in CV.
        X (DataFrame/ndarray): Training dataset.
        y (Series/ndarray): Training target values.
        metric (string): The metric used to measure model performance ('rmse' or 'rmsle').
    """
    # K-Fold CV
    kf = KFold(n_splits=k)
    # Initialize list for scores
    scores = []
    # Ensure that X and y are ndarray
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    if isinstance(y, pd.Series):
        y = y.to_numpy()
        
    print('Manual K-Fold Cross-Validation:')
    
    # Carry out K-Fold CV
    for train_index, valid_index in kf.split(X):
        # Split data
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        
        # Fit model on training set
        model.fit(X_train, y_train)
        # Predict with validation set
        predictions = model.predict(X_valid)
        # Evaluate model
        scores.append(evaluate(predictions, y_valid, metric))
    
    scores = np.array(scores)
    print(f'Mean Accuracy: {scores.mean():0.3f} (+/- {scores.std() * 2:0.3f})')
    print('========================================')

### SKLearn CV function

In [47]:
def sklearnKFoldCV(model, k, X, y, metric):
    
    if metric == 'rmse':
        scoring_metric = 'neg_root_mean_squared_error'
        
    print('Sklearn K-Fold Cross-Validation:')
        
    cv=KFold(n_splits=k, shuffle=True, random_state=42)
    scores = abs(cross_val_score(model, X, y, scoring=scoring_metric, cv=cv))
    _ = [print(f'Score (RMSE): {score:.3f}') for score in scores]
    print(f'Mean Accuracy: {scores.mean():0.3f} (+/- {scores.std() * 2:0.3f})')

### Create Features & Target

In [48]:
features = df_train.drop(['num_of_profile_likes'], axis=1)
target = df_train['num_of_profile_likes'].apply(lambda x: np.log(x + 1))
features.head()

Unnamed: 0,personal_url_binary,Num of People Following,Num of Status Updates,utc_offset_bins,location_public_visibility_binary,profile_category_unknown,user_language_bins,verification_status_verified,user_timezone_bins,user_timezone_north america,verification_status_not verified,profile_creation_year,Num of Direct Messages,user_timezone_asia,Is Profile View Size Customized?,Avg Daily Profile Clicks,utc_offset__32400.0,user_language__ja,profile_category_business
0,0,0.60531,0.647729,4,1,0,5,1,4,0,0,2008,0.451868,1,False,1.5761,0,0,1
1,0,0.410332,0.558407,3,1,1,5,1,4,0,0,2010,0.378497,0,False,11.2782,0,0,0
2,0,0.545488,0.521408,4,1,1,5,0,5,1,1,2009,0.36222,0,False,0.5725,0,0,0
3,1,0.509969,0.684443,4,1,0,5,1,5,1,0,2009,0.553938,0,False,4.0265,0,0,1
4,1,0.3288,0.497094,4,1,1,5,0,5,1,1,2009,0.32525,0,False,3.9229,0,0,0


## Train Test Split

In [49]:
X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.2, shuffle=True, random_state=42)

X_test = df_test

### Test with simple models

In [50]:
xtra_tree = ExtraTreesRegressor(max_depth=3)
xtra_tree.fit(X_train, y_train)
KFoldCV(xtra_tree, 5, features, target, 'rmse')
sklearnKFoldCV(xtra_tree, 5, features, target, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.885
Score (RMSE): 1.940
Score (RMSE): 1.891
Score (RMSE): 1.924
Score (RMSE): 2.051
Mean Accuracy: 1.938 (+/- 0.120)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.982
Score (RMSE): 1.888
Score (RMSE): 1.988
Score (RMSE): 1.841
Score (RMSE): 1.997
Mean Accuracy: 1.939 (+/- 0.127)


In [51]:
ada = AdaBoostRegressor(random_state=42)
ada.fit(X_train, y_train)
KFoldCV(ada, 5, features, target, 'rmse')
sklearnKFoldCV(ada, 5, features, target, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 2.094
Score (RMSE): 2.020
Score (RMSE): 1.986
Score (RMSE): 2.093
Score (RMSE): 2.132
Mean Accuracy: 2.065 (+/- 0.108)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 2.078
Score (RMSE): 2.047
Score (RMSE): 2.161
Score (RMSE): 2.015
Score (RMSE): 2.044
Mean Accuracy: 2.069 (+/- 0.100)


In [54]:
dt = DecisionTreeRegressor(max_depth=10, random_state=42, criterion="mse", splitter="best", max_features="auto", max_leaf_nodes=10)
dt.fit(X_train, y_train)
KFoldCV(dt, 5, features, target, 'rmse')
sklearnKFoldCV(dt, 5, features, target, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 1.934
Score (RMSE): 1.946
Score (RMSE): 1.910
Score (RMSE): 1.912
Score (RMSE): 2.019
Mean Accuracy: 1.944 (+/- 0.079)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.977
Score (RMSE): 1.908
Score (RMSE): 1.959
Score (RMSE): 1.859
Score (RMSE): 1.991
Mean Accuracy: 1.939 (+/- 0.098)


In [55]:
svr = SVR()
svr.fit(X_train, y_train)
KFoldCV(svr, 5, features, target, 'rmse')
sklearnKFoldCV(svr, 5, features, target, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 2.544
Score (RMSE): 2.595
Score (RMSE): 2.610
Score (RMSE): 2.628
Score (RMSE): 2.714
Mean Accuracy: 2.618 (+/- 0.111)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 2.677
Score (RMSE): 2.612
Score (RMSE): 2.698
Score (RMSE): 2.484
Score (RMSE): 2.617
Mean Accuracy: 2.618 (+/- 0.150)


In [56]:
svr = SVR(C=2, coef0=1, kernel="poly")
svr.fit(X_train, y_train)
KFoldCV(svr, 5, features, target, 'rmse')
sklearnKFoldCV(svr, 5, features, target, 'rmse')

Manual K-Fold Cross-Validation:
Score (RMSE): 2.527
Score (RMSE): 2.580
Score (RMSE): 2.594
Score (RMSE): 2.613
Score (RMSE): 2.698
Mean Accuracy: 2.602 (+/- 0.111)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 2.663
Score (RMSE): 2.597
Score (RMSE): 2.681
Score (RMSE): 2.467
Score (RMSE): 2.601
Mean Accuracy: 2.602 (+/- 0.150)


## Grid Search CV

In [96]:
# param = {'C': [1, 2, 3, 4, 5], 'coef0': [0, 1, 2, 3, 4, 5], "cache_size": [200, 300, 400, 500, 600], "kernel": ["linear", "poly", "rbf", "sigmoid"], "degree": [1,2,3,4,5,6], "gamma": ["scale","auto"]}
param = {"cache_size": [5,10,15,20, 50]}
reg = SVR(C=2, coef0=1, kernel="poly")

#param = {'n_estimators': [50, 60, 70, 80, 90, 100], 'loss': ['linear', 'square', 'exponential']}
#reg = AdaBoostRegres sor(random_state=42)
 
#param = {'criterion': ["mse", "mae", "friedman_mse"], "splitter": ["best", "random"], "max_depth": [1,2,3,4,5,6], "min_samples_split": [2,3,4,5], "min_samples_leaf": [1,2,3,4], "max_features": ["auto", "sqrt", "log2"]}
#reg = DecisionTreeRegressor(random_state=42)

grids = GridSearchCV(reg, param, cv=5)
grids.fit(X_train, y_train)
print(grids.best_params_)

{'cache_size': 5}


# Best Models

In [100]:
best_reg = SVR(C=2, coef0=1, kernel="poly", cache_size=1000)
KFoldCV(best_reg, 5, features, target, 'rmse')

Score (RMSE): 1.822135325678774
Score (RMSE): 1.8382006777389766
Score (RMSE): 1.7977381510526695
Score (RMSE): 1.8194061317895012
Score (RMSE): 1.9403808979232184
Mean Accuracy: 1.84 (+/- 0.10)


In [None]:
best_reg = SVR(C=2, coef0=1, kernel="poly")
KFoldCV(best_reg, 10, features, target, 'rmse')

Score (RMSE): 1.819170288632793


## Pipeline

In [57]:
path_drive = 'data/ift6758-a20/'
df_train = pd.read_csv(path_drive + 'train_clean.csv')
df_test = pd.read_csv(path_drive + 'test_clean.csv')

# features & target
features = df_train.drop(['num_of_profile_likes'], axis=1)
target = df_train['num_of_profile_likes'].apply(lambda x: np.log(x + 1))

# train test split
X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.3, shuffle=True, random_state=42, )

X_test = df_test

print("ADA:")
ada = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_features=None, max_depth=9), learning_rate=0.01, random_state=42)
ada.fit(X_train, y_train)
KFoldCV(ada, 5, features, target, 'rmse')
sklearnKFoldCV(ada, 5, features, target, 'rmse')
ada_predict = ada.predict(X_test)

ADA:
Manual K-Fold Cross-Validation:
Score (RMSE): 1.750
Score (RMSE): 1.776
Score (RMSE): 1.745
Score (RMSE): 1.774
Score (RMSE): 1.842
Mean Accuracy: 1.778 (+/- 0.069)
Sklearn K-Fold Cross-Validation:
Score (RMSE): 1.800
Score (RMSE): 1.749
Score (RMSE): 1.830
Score (RMSE): 1.714
Score (RMSE): 1.802
Mean Accuracy: 1.779 (+/- 0.084)


# Submit on Kaggle

In [15]:
df_test_init = pd.read_csv(path_drive + "test.csv")

In [23]:
regr = ada # CHOOSE REG FROM ABOVE 
predictions = regr.predict(X_test)
predictions = np.exp(predictions ) - 1
submission = pd.DataFrame({'Id':df_test_init['Id'], 'Predicted':predictions})
submission.to_csv('answers/submission_ADA_06_M.csv', index=False)

In [21]:
submission.head()

Unnamed: 0,Id,Predicted
0,49I3SOKLI2CMNGP4,3818.029154
1,727IRIR59A3P88LK,2543.840176
2,LN95SD15SRPCEE8F,569.785977
3,TB11I7F0PN033D4T,4124.536327
4,32PSGCK5PATHMR07,304.760169


# Choose best features combination

Function that tries all possible combination of features on a certain model. This was used in the beginning to see if even though some features had the best score we still could get a better model by selecting certain features above others.

This turns out to be very helpful for SVR as it required less than 10 features to achieve a good score. For adaboost It had no impact as the more features (around 20) lead to a better score

In [58]:
def auto_param_RMSLE():
    best_res = 10
    best_features = []
    under_2 = []
    
    features = select_features(df_train)
    print("Total length: ", len(features))
    for i in range(1,len(features)):
        print("-----------")
        print(i)
        combi = list(itertools.combinations(features, i))
        for tuples in combi:
            selection = df_train[list(tuples)]
            target = df_train['Num of Profile Likes']
            
            X_train, X_valid, y_train, y_valid = train_test_split(selection, target, test_size=0.2, shuffle=True, random_state=42)
            
            br = BaggingRegressor()
            br.fit(X_train, y_train)
            new_res = evaluate(br, X_valid, y_valid)
            
            best_res = min(best_res, new_res)
            
            if best_res == new_res:
                best_features = list(tuples)
            
            if new_res < 2:
                under_2.append([new_res,tuples])
        print("Best res now: ", best_res)
        print("best_features now: ",best_features)
    print("Overal best_res: ", best_res)
    print("Overal best_features: ",best_features)
    print("Overal under_2: ",under_2 ) 
            