In [None]:
import pandas as pd
import numpy as np
import ast

In [None]:
# Load the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Turn each string in the `genres` column into the Python literal it spells out
# (presumably a list of genre names — confirm against train.csv).
# Only string values are parsed, so re-running this cell on already-parsed data
# is a no-op instead of raising ValueError from ast.literal_eval.
train['genres'] = train['genres'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import  MultiLabelBinarizer

# Learn the label vocabulary from the training genres, then encode every row
# as a binary indicator vector over that vocabulary.
multilabel_binarizer = MultiLabelBinarizer().fit(train['genres'])
Y = multilabel_binarizer.transform(train['genres'])

# TF-IDF features of the dialogue text; terms appearing in fewer than 10
# documents are dropped (min_df=10).
tfidf_vectorizer = TfidfVectorizer(min_df=10)
X_tfidf = tfidf_vectorizer.fit_transform(train['dialogue'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import Perceptron

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
(
    x_train_tfidf,
    x_test_tfidf,
    y_train_tfidf,
    y_test_tfidf,
) = train_test_split(X_tfidf, Y, test_size=0.2, random_state=1)

In [None]:
# One-vs-rest Perceptron baseline. Note that despite its name, `rfc` wraps a
# Perceptron, not a random forest.
rfc = OneVsRestClassifier(Perceptron(), n_jobs=-1)
rfc.fit(x_train_tfidf, y_train_tfidf)

# predict() returns the full indicator matrix itself, so no pre-allocated
# buffer (and no hard-coded split size) is needed.
y_pred = rfc.predict(x_test_tfidf)
print(f1_score(y_test_tfidf, y_pred, average='samples'))


0.5722729973844991


In [None]:
# Base estimators for the one-vs-rest wrappers.
# NOTE(review): multi_class='multinomial' is kept exactly as written, but each
# one-vs-rest sub-problem is binary and the parameter is deprecated in recent
# scikit-learn releases — confirm it is still intended.
lr = LogisticRegression(max_iter=1000, multi_class='multinomial')
svc = LinearSVC()

lr_ovr = OneVsRestClassifier(lr)
lr_ovr.fit(x_train_tfidf, y_train_tfidf)
y_pred_proba = lr_ovr.predict_proba(x_test_tfidf)

# Assign a label whenever its predicted probability reaches 0.3. The comparison
# is vectorised over the whole probability matrix, so it adapts to whatever
# number of validation rows and labels the split actually produced.
y_pred = (y_pred_proba >= 0.3).astype(int)
print(f1_score(y_test_tfidf, y_pred, average='samples'))

0.6060228001639004   0
0.6362337681840318   1
0.649591536393807   2
0.6561915273835803   3
0.6599764661462187   4
0.662382411179546   5
0.6642787420865539   6
0.6647865327146311   7
0.6669039359673229   8
0.6669596062960031   9
0.6670928289322665   10
0.668270916061158   11
0.6676665872908608   12
0.6670365150159716   13
0.6664553553998074   14
0.6665345166764277   15
0.6663900312569052   16
0.6661428936128219   17
0.6660407777112737   18
0.6662093976663512   19
0.6661180082250497   20
0.6660401341236588   21
0.6660082765367263   22
0.665779481139665   23
0.6654078092921181   24
0.6652327534608924   25
0.6646116914126105   26
0.6641219212377478   27
0.6638516144395318   28
0.6627903384627266   29
0.6618362198237857   30
0.6614524270761601   31
0.6609034468407358   32
0.6603985523569251   33
0.6605684594872322   34
0.6599901960153345   35
0.6598943014607294   36
0.6596854572797267   37
0.6593312695623819   38
0.659031357733885   39
0.658693474236115   40
0.6584386135406542   41
0.658189

In [None]:
# Refit the logistic-regression wrapper on every labelled row before
# predicting on the real test set.
lr_ovr.fit(X_tfidf, Y)

# `svc_ovr` is not created in any earlier cell of this notebook, so build it
# here from the LinearSVC base estimator `svc`; otherwise this cell raises
# NameError on a fresh kernel.
svc_ovr = OneVsRestClassifier(svc)
svc_ovr.fit(X_tfidf, Y)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.0001,
                                        verbose=0),
                    n_jobs=None)

In [None]:
# Vectorise the test dialogues with the vocabulary and IDF weights already
# fitted on the training data (transform only, no refit).
X_test = tfidf_vectorizer.transform(test['dialogue'])

In [None]:
# Hard label predictions from the linear SVM wrapper.
y_pred_svc = svc_ovr.predict(X_test)

# Logistic regression: threshold the per-label probabilities at 0.3, the same
# cut-off used during validation. The vectorised comparison sizes itself from
# the probability matrix, so no test-set row count or label count is hard-coded.
y_pred_proba_lr = lr_ovr.predict_proba(X_test)
y_pred_lr = (y_pred_proba_lr >= 0.3).astype(int)

In [None]:
# Map each indicator matrix back to per-row collections of label names using
# the binarizer fitted on the training genres.
Y_svc, Y_lr = (
    multilabel_binarizer.inverse_transform(indicator_matrix)
    for indicator_matrix in (y_pred_svc, y_pred_lr)
)

In [None]:
# Wrap each model's predicted label collections in a single-column frame.
res_svc, res_lr = (
    pd.DataFrame({'genres': predicted_labels})
    for predicted_labels in (Y_svc, Y_lr)
)

In [None]:
# Flatten each row's label collection into one space-separated string.
# Rows that are already strings are left untouched, so re-running this cell
# does not split previously joined text into single characters.
res_lr['genres'] = [
    labels if isinstance(labels, str) else ' '.join(map(str, labels))
    for labels in res_lr['genres']
]
res_svc['genres'] = [
    labels if isinstance(labels, str) else ' '.join(map(str, labels))
    for labels in res_svc['genres']
]

In [None]:
# Persist both prediction tables as CSV files in the working directory
# (the default index column is written as well).
res_lr.to_csv(path_or_buf='res_lr_multi.csv')
res_svc.to_csv(path_or_buf='res_svc.csv')