In [1]:
import pandas as pd
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
#df_joined = pd.read_csv("regressionPreprocessing_short.csv")
df_joined = pd.read_csv("regressionPreprocessing.csv")
df_joined = df_joined.fillna(0)
print("Length after import: " + str(len(df_joined)))
print("Features after import: " + str(len(df_joined.columns)))

Length after import: 43872
Features after import: 130


In [3]:
# neue Reihenfolge
#cols = df_joined.columns.tolist()
#cols = cols[-1:] + cols[:-1] # letzte kommt an erster Stelle
#df_joined = df_joined[cols]

In [4]:
# meaning out 0 budgets - there are a lot, so this is better than removing the rows
df_joined['budget']=df_joined['budget'].replace(0,df_joined['budget'].mean())

### Remove Features, which are not needed

In [5]:
features_to_remove = ['director','+18','18+','actors','productionCompanies','productionCountries','imdbId','spokenLanguages','budget_norm','runtime_norm']
for i in features_to_remove:
    if i in df_joined.columns:
        df_joined = df_joined.drop(columns=i)
df_joined.head(5)

Unnamed: 0,budget,runtime,Mystery,Foreign,History,TV Movie,Crime,Family,Music,Documentary,...,Bette Davis,John Carradine,Lionel Barrymore,Charles Lane,John Wayne,Henry Fonda,Michael Caine,Boris Karloff,James Franco,Grey Griffin
0,30000000.0,81.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,65000000.0,104.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,4328343.0,101.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,16000000.0,127.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4328343.0,106.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Train/test split
#### features_train + rating_train ;                   features_test + rating_test

In [6]:
from sklearn.model_selection import train_test_split

# separate features and target variable
rating = df_joined['rating'] # weight
features = df_joined.drop(columns=['rating'])

# create a train/test split
features_train, features_test, rating_train, rating_test = train_test_split(
    features, rating, test_size=0.1, random_state=42)

print("Train: " + str(len(features_train)) + " Features and " + str(len(rating_train)) + " Ratings")
print("Test: " + str(len(features_test)) + " Features and " + str(len(rating_test)) + " Ratings")

Train: 39484 Features and 39484 Ratings
Test: 4388 Features and 4388 Ratings


In [7]:
from xgboost import XGBRegressor
regressor = XGBRegressor(colsample_bytree= 0.6, gamma= 0.7, max_depth= 4, min_child_weight= 5,
                         subsample = 0.8, objective='reg:squarederror')
regressor.fit(features_train, rating_train)

importances = {}

count = 0
for feature_importance in regressor.feature_importances_:
    if feature_importance > 0.002:
        feature_name = features_train.columns[count]
        importances[feature_name] = feature_importance
    count+=1

for a in importances.items():
    print(a)

('budget', 0.00972647)
('runtime', 0.009352073)
('Mystery', 0.020576613)
('Foreign', 0.00800058)
('History', 0.021636745)
('TV Movie', 0.010740736)
('Crime', 0.017331397)
('Family', 0.0121769775)
('Music', 0.027358606)
('Documentary', 0.15778288)
('Action', 0.027183298)
('Fantasy', 0.0143706715)
('War', 0.008272914)
('Animation', 0.019134324)
('Thriller', 0.030215187)
('Science Fiction', 0.039088923)
('Drama', 0.12015697)
('Adventure', 0.009992758)
('Romance', 0.012119901)
('Horror', 0.39320117)
('Comedy', 0.011054753)
('Western', 0.0057188934)
('part_of_collection', 0.00679045)
('hasHomepage', 0.008016672)


### Remove features which are not important enough

In [8]:
rating = df_joined['rating']

for i in df_joined.columns:
    if i not in importances:
        df_joined = df_joined.drop(columns=i)

print("Features: ", len(df_joined.columns))
df_joined.head(5)

features = df_joined
# create a train/test split
features_train, features_test, rating_train, rating_test = train_test_split(
    features, rating, test_size=0.1, random_state=42)

print("Train: " + str(len(features_train)) + " Features and " + str(len(rating_train)) + " Ratings")
print("Test: " + str(len(features_test)) + " Features and " + str(len(rating_test)) + " Ratings")

Features:  24
Train: 39484 Features and 39484 Ratings
Test: 4388 Features and 4388 Ratings


In [9]:
from sklearn.linear_model import LinearRegression
lm2 = LinearRegression()
lm2.fit(features_train, rating_train)

# print the coefficients
print(lm2.intercept_)
print(lm2.coef_)

print(list(zip(features_train.columns, lm2.coef_)))

2.931156899673925
[ 7.81635983e-10  3.29050663e-04  9.61352717e-02 -3.67269799e-02
  1.41236519e-01  5.45261807e-02  3.19881227e-02 -9.69628173e-02
  7.45756217e-02  4.16282607e-01 -1.11968879e-01  6.64118934e-02
  5.80847780e-02  1.74059401e-01 -3.31924503e-02 -1.26737171e-01
  2.23198899e-01 -9.41053905e-03  1.91587787e-02 -4.15132345e-01
  3.34635197e-02  4.25870242e-02 -3.65027167e-03 -2.03660929e-02]
[('budget', 7.816359831492934e-10), ('runtime', 0.00032905066317639987), ('Mystery', 0.0961352716614105), ('Foreign', -0.03672697991095928), ('History', 0.14123651866207088), ('TV Movie', 0.05452618071029435), ('Crime', 0.031988122743234766), ('Family', -0.09696281732620733), ('Music', 0.07457562174999581), ('Documentary', 0.41628260668811634), ('Action', -0.11196887919313202), ('Fantasy', 0.06641189336532126), ('War', 0.05808477798941179), ('Animation', 0.17405940131822234), ('Thriller', -0.03319245031518115), ('Science Fiction', -0.126737170687742), ('Drama', 0.22319889907487012), (

In [10]:
from sklearn import metrics
import numpy as np

lm2 = LinearRegression()
lm2.fit(features_train,rating_train)
y_pred = lm2.predict(features_test)

# calculate MAE, MSE, RMSE
print("MAE: ", metrics.mean_absolute_error(rating_test, y_pred))
print("MSE: ", metrics.mean_squared_error(rating_test, y_pred))
print("RMSE", np.sqrt(metrics.mean_squared_error(rating_test, y_pred)))

# MSE is more popular than MAE because MSE "punishes" larger errors. But, RMSE is even more popular than MSE because RMSE is interpretable in the "y" units

MAE:  0.5110551070770686
MSE:  0.4815075136463717
RMSE 0.6939074244064346


### Reduce Dimensions with PCA (data visualization)
#### principalDf contains the features in only 2 dimensions

In [11]:
# # Separating out the features
# x = features
# y = rating

# # Standardize the Data
# from sklearn.preprocessing import StandardScaler
# x = StandardScaler().fit_transform(x)

# from sklearn.decomposition import PCA
# pca = PCA(n_components=2)# X dimensions to 2
# principalComponents = pca.fit_transform(x)
# principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])

# finalDf = pd.concat([principalDf, df_joined[['rating']]], axis = 1)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit on training set only
scaler.fit(features_train)

# Apply transform to both the training set and the test set.
train_features = scaler.transform(features_train)
test_features = scaler.transform(features_test)

from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.95)

pca.fit(train_features)

train_features = pca.transform(train_features)
test_features = pca.transform(test_features)

print("Train: " + str(len(train_features)) + " Features and " + str(len(rating_train)) + " Ratings")
print("Test: " + str(len(test_features)) + " Features and " + str(len(rating_test)) + " Ratings")

Train: 39484 Features and 39484 Ratings
Test: 4388 Features and 4388 Ratings


#### RIDGE

In [13]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

reg = linear_model.Ridge(alpha=.5)
reg.fit(features_train,rating_train)

#print(reg.coef_)       # coefficients w
print(reg.intercept_)

print("#################")

lr = LinearRegression()
lr.fit(features_train, rating_train)

rr = Ridge(alpha=0.01) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely
# restricted and in this case linear and ridge regression resembles
rr.fit(features_train, rating_train)

rr100 = Ridge(alpha=100) #  comparison with alpha value
rr100.fit(features_train, rating_train)

train_score=lr.score(features_train, rating_train)
test_score=lr.score(features_test, rating_test)

Ridge_train_score = rr.score(features_train,rating_train)
Ridge_test_score = rr.score(features_test, rating_test)

Ridge_train_score100 = rr100.score(features_train,rating_train)
Ridge_test_score100 = rr100.score(features_test, rating_test)

print("linear regression train score:", train_score)
print("linear regression test score:", test_score)
print("ridge regression train score low alpha:", Ridge_train_score)
print("ridge regression test score low alpha:", Ridge_test_score)
print("ridge regression train score high alpha:", Ridge_train_score100)
print("ridge regression test score high alpha:", Ridge_test_score100)

plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markersplt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparencyplt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.show()

NameError: name 'linear_model' is not defined

In [None]:
from sklearn.linear_model import Lasso

print("### Lasso Standard (Alpha=1) ###")

lasso = Lasso()
lasso.fit(features_train,rating_train)
train_score=lasso.score(features_train,rating_train)
test_score=lasso.score(features_test,rating_test)
coeff_used = np.sum(lasso.coef_!=0)

print("training score: ", train_score )
print("test score: ", test_score)
print("number of features used: ", coeff_used)

print("\n### Lasso (low Alpha = 0.01) ###")

lasso001 = Lasso(alpha=0.01, max_iter=10e5)
lasso001.fit(features_train,rating_train)
train_score001=lasso001.score(features_train,rating_train)
test_score001=lasso001.score(features_test,rating_test)
coeff_used001 = np.sum(lasso001.coef_!=0)

print("training score for alpha=0.01:", train_score001)
print("test score for alpha =0.01: ", test_score001)
print("number of features used: for alpha =0.01:", coeff_used001)

print("\n### Lasso (very low Alpha = 0.0001) ###")

lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
lasso00001.fit(features_train,rating_train)
train_score00001=lasso00001.score(features_train,rating_train)
test_score00001=lasso00001.score(features_test,rating_test)
coeff_used00001 = np.sum(lasso00001.coef_!=0)

print("training score for alpha=0.0001:", train_score00001)
print("test score for alpha =0.0001: ", test_score00001)
print("number of features used: for alpha =0.0001:", coeff_used00001)

print("\n### Linear Regression ###")

lr = LinearRegression()
lr.fit(features_train,rating_train)
lr_train_score=lr.score(features_train,rating_train)
lr_test_score=lr.score(features_test,rating_test)

print("LR training score:", lr_train_score)
print("LR test score: ", lr_test_score)

print("\n### Visualisierung ###")

plt.subplot(1,2,1)
plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency

plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.subplot(1,2,2)

plt.plot(lasso.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Lasso; $\alpha = 1$',zorder=7) # alpha here is for transparency
plt.plot(lasso001.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Lasso; $\alpha = 0.01$') # alpha here is for transparency
plt.plot(lasso00001.coef_,alpha=0.8,linestyle='none',marker='v',markersize=6,color='black',label=r'Lasso; $\alpha = 0.00001$') # alpha here is for transparency
plt.plot(lr.coef_,alpha=0.7,linestyle='none',marker='o',markersize=5,color='green',label='Linear Regression',zorder=2)

plt.xlabel('Coefficient Index',fontsize=16)
plt.ylabel('Coefficient Magnitude',fontsize=16)
plt.legend(fontsize=13,loc=4)
plt.tight_layout()
plt.show()

In [None]:
from sklearn import linear_model

print("### Bayesian Ridge Regression ###")
reg = linear_model.BayesianRidge()
reg.fit(features_train,rating_train)
train_score=reg.score(features_train,rating_train)
test_score=reg.score(features_test,rating_test)
#coeff_used = np.sum(reg.coef_!=0)

print("training score: ", train_score )
print("test score: ", test_score)
print("number of features used: ", coeff_used)

In [None]:
from sklearn.model_selection import train_test_split

# separate features and target variable
rating = df_joined['rating'] # weight
features = df_joined.drop(columns=['rating'])

# create a train/test split
features_train, features_test, rating_train, rating_test = train_test_split(
    features, rating, test_size=0.1, random_state=42)

print("Train: " + str(len(features_train)) + " Features and " + str(len(rating_train)) + " Ratings")
print("Test: " + str(len(features_test)) + " Features and " + str(len(rating_test)) + " Ratings")

In [None]:
print("### MLP Regressor (multilayer neuronal network) ###")
from sklearn.neural_network import MLPRegressor
reg = MLPRegressor(hidden_layer_sizes=(15,),
            activation='relu', # 'logistic' / 'relu'
            solver='sgd', # 'sgd'/'adam'
            learning_rate='adaptive', # default ‘constant’
            max_iter=10000,
            learning_rate_init=0.01,
            alpha=0.0001) # default 0.0001
reg.fit(features_train,rating_train)
train_score=reg.score(features_train,rating_train)
test_score=reg.score(features_test,rating_test)
#coeff_used = np.sum(reg.coefs_!=0)

print("training score: ", train_score )
print("test score: ", test_score)
print("loss: ", reg.loss_)