In [29]:
import warnings
# Install a blanket "ignore" filter so warnings are no longer shown,
# presumably to keep the cell outputs clean.
# NOTE(review): this also hides any warnings raised by the model-fitting
# cells further down; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
%matplotlib inline

import scipy as sp
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DecisionTree
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [2]:
def f1_genres(genre_real, genre_predict):
    ''' loss function f1_genre: mean of the per-genre F1 scores

    input: two pandas dataframes with one column per genre,
        genre_real: real (ground-truth) values
        genre_predict: predicted values
    output: mean f1 score over all genre columns, or None (after printing
        a message) when the inputs are empty or their shapes disagree

    Columns are paired by label when both frames carry the same set of
    unique labels, so a different column order in ``genre_predict`` does
    not score one genre against another. When the labels differ, columns
    are paired by position.
    '''
    count_row = len(genre_real)
    if count_row == 0:
        print("No data in dataframe!")
        return
    if count_row != len(genre_predict):
        print("Different length of predicted and real dataframes!")
        return
    real_cols = list(genre_real.columns)
    pred_cols = list(genre_predict.columns)
    count_col = len(real_cols)
    if count_col == 0:
        print("No data in dataframe!")
        return
    if count_col != len(pred_cols):
        print("Different genres of predicted and real dataframes!")
        return
    # Same labels in a different order: align by label instead of position.
    # The uniqueness check keeps duplicate labels on the positional path.
    same_labels = (
        len(set(real_cols)) == count_col
        and set(real_cols) == set(pred_cols)
    )
    if same_labels:
        pred_cols = real_cols
    score = 0
    for real_col, pred_col in zip(real_cols, pred_cols):
        score += f1_score(genre_real[real_col], genre_predict[pred_col])
    return score / count_col

In [6]:
# Directory containing the train/test CSV files. The trailing slash is
# required because file paths are built by plain string concatenation.
data_dir = "../Final_Data/"

In [10]:
## Load the four CSV splits; the first column of each file becomes the row index.
x_test, x_train, y_test, y_train = (
    pd.read_csv(data_dir + csv_name, index_col=0)
    for csv_name in ("x_test.csv", "x_train.csv", "y_test.csv", "y_train.csv")
)

In [13]:
# Print (rows, columns) for each split, in the order train/test features then labels.
for split_frame in (x_train, x_test, y_train, y_test):
    print(split_frame.shape)

(2500, 118)
(2500, 118)
(2500, 23)
(2500, 23)


In [32]:
# Display the names of all feature columns in the training frame.
x_train.columns.values

array(['year', 'rating', 'votes', 'popularity_TMDB', 'runtime_TMDB',
       'text_PC1', 'text_PC2', 'text_PC3', 'text_PC4', 'text_PC5',
       'text_PC6', 'text_PC7', 'text_PC8', 'text_PC9', 'text_PC10',
       'text_PC11', 'text_PC12', 'text_PC13', 'text_PC14', 'text_PC15',
       'text_PC16', 'text_PC17', 'text_PC18', 'text_PC19', 'text_PC20',
       'text_PC21', 'text_PC22', 'text_PC23', 'text_PC24', 'text_PC25',
       'text_PC26', 'text_PC27', 'text_PC28', 'text_PC29', 'text_PC30',
       'mpaa_PC1', 'mpaa_PC2', 'mpaa_PC3', 'mpaa_PC4', 'mpaa_PC5',
       'mpaa_PC6', 'mpaa_PC7', 'mpaa_PC8', 'mpaa_PC9', 'mpaa_PC10',
       'director0', 'director1', 'director2', 'director3', 'director4',
       'director5', 'director6', 'director7', 'director8', 'director9',
       'director10', 'director11', 'director12', 'director13',
       'director14', 'director15', 'director16', 'director17',
       'director18', 'director19', 'director20', 'director21',
       'director22', 'writer0', 'writer1

In [33]:
# Hand-picked subset of columns used as the baseline feature set.
baseline_feature = [
    'year',
    'rating',
    'votes',
    'popularity_TMDB',
    'runtime_TMDB',
    'animation department count',
    'original music count',
    'country_n',
    'country_usa',
    'country_france',
    'country_uk',
    'country_germany',
    'country_italy',
    'country_canada',
    'country_japan',
    'country_india',
    'country_spain',
]
# Restrict both splits to exactly the same baseline columns.
x_train_base, x_test_base = (
    split_frame[baseline_feature] for split_frame in (x_train, x_test)
)

In [34]:
# Unweighted logistic regression: one independent binary classifier per
# genre column, fitted on the baseline features only.
# random.seed only seeds Python's built-in generator, which scikit-learn
# estimators do not read; it is kept for any later code that uses `random`.
random.seed(123)

genre_pred = pd.DataFrame(index = x_test_base.index) # dataframe to store predicted values

for col in y_train.columns:
    # random_state pins the estimator's own randomness (only used by some solvers).
    unweighted_logistic = LogisticRegression(random_state=123)
    unweighted_logistic.fit(x_train_base, y_train[col])
    # One prediction column per genre, in the same column order as y_train.
    genre_pred[col] = unweighted_logistic.predict(x_test_base)

# Mean of the per-genre F1 scores on the held-out split.
score_unweighted_log = f1_genres(y_test, genre_pred)
print("F1 Score:", score_unweighted_log)

F1 Score: 0.13717116396031673
