## Importing Libraries

In [1]:
import gensim.downloader as api
import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

## Loading the word2vec-google-news-300 pretrained model

In [2]:
## Load the pretrained "word2vec-google-news-300" embeddings through gensim's
## downloader API. `model` is indexed by token further down to obtain one
## vector per name.
## NOTE(review): presumably a large download on first use — confirm.
model = api.load("word2vec-google-news-300")

## Loading the prolific_data

In [5]:
# Read the raw ratings table from disk, report its (rows, columns) shape and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='prolific_data.csv')
print(df.shape)
df.head(n=5)

(210, 303)


Unnamed: 0,ID,Barack Obama,Michael Jackson,Eminem,Justin Bieber,Lady Gaga,Adolf Hitler,Lil Wayne,Rihanna,Miley Cyrus,...,Diego Maradona,Herbert Hoover,Niccolò Machiavelli,Kim Jong-un,Kevin Durant,Wilt Chamberlain,Kate Moss,race,gender,political orientation
0,20,,,60.0,,,21.0,,,,...,,,,,,,,Asian,Female,Democrat
1,78,,39.0,,40.0,,,,,,...,,,,,,69.0,,Asian,Female,Democrat
2,87,,30.0,,12.0,,,17.0,,,...,,,,,,,,Asian,Female,Democrat
3,92,93.0,38.0,,,,,,,,...,,,,,,,,Asian,Female,Democrat
4,149,,9.0,,,,,,,,...,,54.0,66.0,,,,,Asian,Female,Democrat


In [6]:
# List the column labels of the loaded frame.
df.columns

Index(['ID', 'Barack Obama', 'Michael Jackson', 'Eminem', 'Justin Bieber',
       'Lady Gaga', 'Adolf Hitler', 'Lil Wayne', 'Rihanna', 'Miley Cyrus',
       ...
       'Diego Maradona', 'Herbert Hoover', 'Niccolò Machiavelli',
       'Kim Jong-un', 'Kevin Durant', 'Wilt Chamberlain', 'Kate Moss', 'race',
       'gender', 'political orientation'],
      dtype='object', length=303)

In [7]:
# Show the dtype pandas inferred for each column.
df.dtypes

ID                         int64
Barack Obama             float64
Michael Jackson          float64
Eminem                   float64
Justin Bieber            float64
                          ...   
Wilt Chamberlain         float64
Kate Moss                float64
race                      object
gender                    object
political orientation     object
Length: 303, dtype: object

## Data cleaning and manipulation

In [8]:
# Reshape from wide to long: the identifier columns are kept as-is and every
# other column becomes a (variable, value) pair. Rows containing a missing
# value are then discarded.
id_columns = ['ID', 'race', 'gender', 'political orientation']
df = pd.melt(df, id_vars=id_columns)
df = df.dropna()
print(df.shape)
df.head()

(7891, 6)


Unnamed: 0,ID,race,gender,political orientation,variable,value
3,92,Asian,Female,Democrat,Barack Obama,93.0
5,150,Asian,Female,Democrat,Barack Obama,89.0
11,141,Asian,Male,Democrat,Barack Obama,93.0
15,112,Black or African American,Male,Democrat,Barack Obama,100.0
16,177,Black or African American,Male,Democrat,Barack Obama,100.0


In [9]:
## Average the ratings per name: keep only the (variable, value) pair and take
## the mean of `value` within each `variable` group.

df = df.loc[:, ["variable", "value"]]
df1 = df.groupby(by=['variable']).mean()

In [10]:
# Display the per-name mean ratings produced by the groupby.
df1

Unnamed: 0_level_0,value
variable,Unnamed: 1_level_1
Abraham Lincoln,85.255319
Adam Sandler,36.157895
Adolf Hitler,43.419355
Adriana Lima,36.000000
Al Capone,44.925000
...,...
Yao Ming,46.133333
Zinedine Zidane,43.272727
Zlatan Ibrahimovi?,46.833333
Zodiac Killer,9.540541


In [11]:
## Store the word vector of each leader in a dataframe (one column per name)

names = list(df1.index)


def _candidate_tokens(name):
    """Return the vocabulary spellings to try for `name`, in priority order."""
    return (
        name.replace(" ", "_"),
        name.replace(" ", "_").lower(),
        # NOTE(review): this variant leaves spaces untouched, so it only differs
        # from the previous one for names without spaces. It is kept unchanged
        # to preserve the resulting dataset — confirm whether
        # `.replace(" ", "_")` was also intended here.
        name.replace("-", "_").lower(),
    )


vectors = {}
missing_names = []
for name in names:
    for token in _candidate_tokens(name):
        try:
            vectors[name] = model[token]
        except KeyError:
            # Only "token not in the vocabulary" is expected here; any other
            # error (or a keyboard interrupt) should surface, not be swallowed.
            continue
        break
    else:
        # No spelling matched: remember the name so the gap is visible.
        missing_names.append(name)

# `columns=names` keeps every name as a column; names without a vector remain
# all-NaN and can be filtered out by a later dropna.
df = pd.DataFrame(vectors, columns=names)
print(f"{len(missing_names)} of {len(names)} names have no embedding: {missing_names}")

In [12]:
# Display the embedding frame: one column per name, one row per vector
# component. A column that is entirely NaN is a name for which none of the
# lookups succeeded.
df

Unnamed: 0,Abraham Lincoln,Adam Sandler,Adolf Hitler,Adriana Lima,Al Capone,Alan Turing,Albert Einstein,Aleister Crowley,Alex Ferguson,Allen Iverson,...,William Shakespeare,Wilt Chamberlain,Winston Churchill,Wladimir Klitschko,Woodrow Wilson,Yao Ming,Zinedine Zidane,Zlatan Ibrahimovi?,Zodiac Killer,Zooey Deschanel
0,0.558594,0.196289,0.213867,0.007629,-0.047363,0.117676,0.108887,0.267578,0.058350,0.406250,...,0.213867,0.419922,0.205078,-0.104004,0.033203,0.285156,0.390625,,0.044189,0.042969
1,0.279297,0.101562,0.233398,-0.135742,0.138672,-0.042725,0.045654,0.106445,-0.204102,0.011292,...,0.078125,0.094727,0.123535,-0.175781,0.166016,0.515625,0.010864,,-0.151367,-0.243164
2,0.038818,-0.542969,0.373047,-0.142578,0.045410,0.238281,0.369141,0.097168,0.275391,-0.034424,...,0.144531,0.190430,0.417969,0.014404,0.188477,0.112305,0.511719,,-0.037598,-0.453125
3,0.361328,0.589844,0.154297,0.224609,0.322266,0.445312,0.226562,0.250000,0.180664,0.269531,...,0.443359,-0.070312,0.298828,0.212891,0.306641,0.101562,0.316406,,0.062256,-0.089844
4,0.137695,-0.263672,-0.324219,0.048340,0.201172,-0.140625,-0.097656,0.000572,0.265625,0.287109,...,0.085449,0.718750,0.094238,-0.083496,0.243164,0.092285,0.238281,,0.058838,-0.143555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,-0.057861,-0.023193,-0.166016,0.073242,0.168945,-0.051270,0.061523,0.017456,-0.052246,0.063477,...,0.103027,-0.067871,-0.273438,-0.073242,-0.253906,0.209961,0.182617,,-0.177734,0.093750
296,-0.015869,-0.298828,0.006866,-0.070312,-0.028442,-0.241211,-0.104980,0.051025,-0.116699,-0.243164,...,0.069336,-0.416016,-0.019409,-0.503906,0.014832,-0.300781,-0.191406,,0.033447,-0.233398
297,0.025513,-0.298828,0.062988,-0.073730,-0.503906,-0.044922,0.044434,-0.163086,-0.014282,-0.024780,...,-0.151367,-0.042480,0.160156,-0.059082,0.061035,0.210938,-0.231445,,-0.351562,0.001541
298,-0.053467,-0.155273,-0.111328,-0.161133,0.038086,-0.018311,-0.090820,0.146484,0.324219,0.433594,...,-0.042480,-0.052002,-0.003296,-0.212891,0.335938,0.007690,-0.169922,,0.015137,-0.039307


In [13]:
## Flip the table so that each name is a row and each vector component a column

df2 = df.T

In [14]:
## Attach each name's mean rating as column 300 (aligned on the name index),
## then keep only the rows that have no missing entry.

df2[300] = df1["value"]

df2 = df2.dropna(axis=0, how="any")
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
Abraham Lincoln,0.558594,0.279297,0.0388184,0.361328,0.137695,-0.554688,-0.137695,0.464844,0.324219,0.154297,...,-0.0209961,-0.124023,0.10791,0.165039,-0.0578613,-0.0158691,0.0255127,-0.0534668,0.169922,85.255319
Adam Sandler,0.196289,0.101562,-0.542969,0.589844,-0.263672,0.0529785,0.0203857,-0.322266,0.341797,-0.402344,...,0.601562,-0.287109,-0.175781,-0.065918,-0.0231934,-0.298828,-0.298828,-0.155273,0.0874023,36.157895
Adolf Hitler,0.213867,0.233398,0.373047,0.154297,-0.324219,-0.273438,-0.108398,0.0830078,0.496094,0.10498,...,-0.0203857,-0.326172,0.225586,-0.302734,-0.166016,0.00686646,0.0629883,-0.111328,-0.0126343,43.419355
Adriana Lima,0.00762939,-0.135742,-0.142578,0.224609,0.0483398,-0.0219727,0.0532227,-0.679688,0.0103149,0.361328,...,0.0209961,-0.449219,-0.163086,0.139648,0.0732422,-0.0703125,-0.0737305,-0.161133,-0.172852,36.000000
Al Capone,-0.0473633,0.138672,0.0454102,0.322266,0.201172,-0.0415039,-0.103516,0.0771484,0.464844,0.139648,...,-0.026001,-0.106445,0.0151978,0.0986328,0.168945,-0.0284424,-0.503906,0.0380859,0.0766602,44.925000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodrow Wilson,0.0332031,0.166016,0.188477,0.306641,0.243164,-0.455078,-0.0678711,0.209961,-0.193359,-0.0922852,...,-0.144531,-0.151367,0.279297,0.246094,-0.253906,0.0148315,0.0610352,0.335938,-0.0539551,71.735294
Yao Ming,0.285156,0.515625,0.112305,0.101562,0.0922852,-0.287109,0.306641,-0.209961,0.210938,-0.10791,...,0.511719,0.355469,-0.419922,-0.00204468,0.209961,-0.300781,0.210938,0.00769043,-0.240234,46.133333
Zinedine Zidane,0.390625,0.0108643,0.511719,0.316406,0.238281,-0.240234,0.235352,-0.535156,0.378906,-0.0703125,...,0.189453,0.324219,0.176758,-0.060791,0.182617,-0.191406,-0.231445,-0.169922,0.382812,43.272727
Zodiac Killer,0.0441895,-0.151367,-0.0375977,0.0622559,0.0588379,0.124023,-0.200195,0.11377,0.384766,0.115723,...,0.00723267,-0.0600586,0.179688,0.155273,-0.177734,0.0334473,-0.351562,0.0151367,-0.0292969,9.540541


## Linear Regression 

In [15]:
# Column 300 holds the mean rating appended to the embedding table; every
# other column is used as a predictor.
target_column = [300]

# Keep the predictors in their original column order. A set difference would
# return them in an implementation-dependent order, which makes the feature
# matrix (and any inspection of fitted coefficients) harder to reproduce.
predictors = [column for column in df2.columns if column not in target_column]

# NOTE(review): the recorded describe() output lists only column 300, which
# suggests the predictor columns are not stored with a numeric dtype — confirm.
df2.describe()

Unnamed: 0,300
count,276.0
mean,46.801932
std,16.748445
min,0.0
25%,35.251526
50%,46.081373
75%,59.829464
max,85.918919


In [16]:
# Feature matrix and target as plain arrays (the target stays 2-D because
# `target_column` is a list).
X = df2[predictors].to_numpy()
y = df2[target_column].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=40,
)
print(X_train.shape)
print(X_test.shape)

(220, 300)
(56, 300)


In [17]:
# Ordinary least squares baseline fitted on the training split; `fit` returns
# the estimator itself, so showing `lr` displays the fitted model.
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [18]:
# Spell out the estimator's settings explicitly. `normalize` is omitted: it was
# deprecated in scikit-learn 1.0 and removed in 1.2, where passing it raises a
# TypeError. This instance is only displayed; the fitted model is `lr`.
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1)

LinearRegression(n_jobs=1)

In [19]:
# Score the least-squares model on both splits: RMSE first, then R^2.
# In the recorded run the training fit is essentially exact (220 rows, 300
# features) while the test R^2 is negative, i.e. the model overfits.
pred_train_lr = lr.predict(X_train)
pred_test_lr = lr.predict(X_test)

for actual, predicted in ((y_train, pred_train_lr), (y_test, pred_test_lr)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

4.3954921229489806e-14
1.0
27.097424464454637
-1.1476186696275485


### Lasso Regression

In [20]:
# Lasso regression with alpha=0.01, fitted on the training split.
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X_train, y_train)

pred_train_lasso = model_lasso.predict(X_train)
pred_test_lasso = model_lasso.predict(X_test)

# Report RMSE then R^2, training split first and test split second.
for actual, predicted in ((y_train, pred_train_lasso), (y_test, pred_test_lasso)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

2.670666315890275
0.9726449827430363
17.63488586842913
0.09040838411794205


### Ridge Regression

In [21]:
# Ridge regression with alpha=0.01, fitted on the training split.
rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train)

pred_train_rr = rr.predict(X_train)
pred_test_rr = rr.predict(X_test)

# Report RMSE then R^2, training split first and test split second.
for actual, predicted in ((y_train, pred_train_rr), (y_test, pred_test_rr)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

0.23441903140129203
0.9997892422906861
25.980740369985504
-0.9742592792396711


### Elastic Net

In [22]:
# Elastic net with alpha=0.01 (all other settings left at the library
# defaults), fitted on the training split.
model_enet = ElasticNet(alpha=0.01)
model_enet.fit(X_train, y_train)

pred_train_enet = model_enet.predict(X_train)
pred_test_enet = model_enet.predict(X_test)

# Report RMSE then R^2, training split first and test split second.
for actual, predicted in ((y_train, pred_train_enet), (y_test, pred_test_enet)):
    print(np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_score(actual, predicted))

4.536170616431112
0.9210819224069985
12.504803797134104
0.5426434471099484
