# Practice №11
## by Anton Kondrashov
##### HSE, CS, BSE 141(1)

### Dependencies

In [79]:
%precision 6
%load_ext line_profiler
%load_ext autoreload
%autoreload 1

import codecs
import json
import os
import re

import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.display.max_colwidth=100
np.set_printoptions(linewidth=140,edgeitems=10)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Datasets

In [80]:
def unpack_dataset(path, to=None):
    """Load a text file holding one Python dict literal per line into a DataFrame.

    Parameters
    ----------
    path : str
        File to read. Every non-blank line must evaluate to a dict.
    to : pandas.DataFrame, optional
        Existing frame whose rows are placed before the rows read from ``path``.

    Returns
    -------
    pandas.DataFrame
        One row per record; rows of ``to`` (if given) come first.
    """
    # orient='records' gives one dict per row. The default to_dict() is
    # column-oriented and would turn every column into a bogus "row".
    records = [] if to is None else to.to_dict(orient='records')

    with open(path) as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue
            # SECURITY NOTE(review): eval() executes whatever code the file
            # contains. Only use on trusted files; consider ast.literal_eval
            # if every line is a plain literal.
            records.append(eval(line))

    return pd.DataFrame(records)

def labelize(column_name, dataset):
    """Replace ``dataset[column_name]`` in place with integer label codes.

    A fresh ``LabelEncoder`` is fitted on the column and the column is
    overwritten with the encoded values. Nothing is returned and the fitted
    encoder is not kept.
    """
    encoder = preprocessing.LabelEncoder()
    dataset[column_name] = encoder.fit_transform(dataset[column_name])
    
def process_runtime(time):
    """Extract the leading integer from a runtime value such as ``"120 min"``.

    Parameters
    ----------
    time : object
        Usually a string. Non-string values (NaN, None, numbers) are
        converted with ``str()`` first, so they no longer raise.

    Returns
    -------
    int
        The digits at the very start of the value, or 0 when the value does
        not start with a digit (e.g. ``"N/A"``, ``""``, NaN, None).
    """
    # re.match anchors at the start of the string, which is exactly what
    # taking the first result of findall(r"\d*") did.
    leading_digits = re.match(r"\d+", str(time))
    if leading_digits is None:
        return 0
    return int(leading_digits.group())

# def replace_dates(dataset):
#     extension_set = pd.DataFrame(columns=['year', 'month', 'day'])
#     date['time'].apply(lambda x: x[:3])

In [81]:
# Dataset directory. Set the DATASETS_DIR environment variable to run the
# notebook on another machine; the default is the original local folder.
main_path = os.environ.get('DATASETS_DIR', '/Users/kondranton/Desktop/Datasets')
movie_descriptions = unpack_dataset(main_path + '/movie_descriptions.txt')

# Training feedback: one dict literal per line, collected into a plain list.
# SECURITY NOTE(review): eval() executes whatever code the file contains.
# Only use on trusted files.
feedbacks = []
with open(main_path + '/train_feedback.txt') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in eval().
        if line.strip():
            feedbacks.append(eval(line))

In [82]:
# Append the records of the test feedback file to the same `feedbacks` list.
# SECURITY NOTE(review): eval() executes whatever code the file contains.
with open(main_path + '/test_feedback.txt') as test_file:
    for raw_record in test_file:
        feedbacks.append(eval(raw_record))

In [83]:
# Turn the list of feedback dicts into a DataFrame. The name `feedbacks` is
# rebound from a list to a frame, so this cell must run exactly once after the
# two loading cells.
feedbacks = pd.DataFrame(feedbacks)

### Preprocessing

In [84]:
# desc
# Runtime strings become integers; categorical columns become label codes.
movie_descriptions['runtime'] = movie_descriptions['runtime'].apply(process_runtime)
for categorical_column in ('rated', 'type', 'genre', 'language'):
    labelize(categorical_column, movie_descriptions)

# The feedback timestamp column is label-encoded as well.
labelize('time', feedbacks)

In [38]:
# Centre every score on its author's rounded mean score.
# groupby().mean() skips missing scores, so rows without a score no longer
# make int(round(...)) fail; it also replaces the per-user boolean filter,
# which rescanned the whole frame once per user.
per_user_mean = feedbacks.groupby('UserId')['score'].mean().dropna()
user_avarage_scores = {
    user_id: int(round(mean_score))
    for user_id, mean_score in per_user_mean.items()
}

# Users without any scored feedback get offset 0; their scores are missing
# anyway and stay NaN.
# NOTE: this overwrites 'score', so running the cell twice subtracts the
# averages twice.
user_offsets = feedbacks['UserId'].map(user_avarage_scores).fillna(0)
feedbacks['score'] = feedbacks['score'] - user_offsets

In [85]:
# Attach the movie description columns to every feedback row (default inner join on MovieId).
Z = feedbacks.merge(movie_descriptions, on='MovieId')

In [86]:
# Shuffle all rows. sample() is called without random_state, so the seed set
# on the line above is what makes the permutation repeatable.
np.random.seed(0)
Z = Z.sample(frac=1.0).reset_index(drop=True)

In [87]:
# The score column is the prediction target; call it 'y' from here on.
Z = Z.rename(columns={'score': 'y'})

In [88]:
# Only the review text is vectorised; the other text columns are left out.
corpus = Z['text'] #+ Z.synopsis + Z.writer + Z.summary + Z.director

In [89]:
# TF-IDF features of the review text (English stop words removed).
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Target vector, in the same row order as X.
Y = Z['y'].values

In [91]:
from sklearn.feature_extraction import DictVectorizer

# One dict per row holding the non-text features, vectorised into a sparse matrix.
meta_columns = ['UserId', 'rated', 'runtime', 'year']
meta_records = Z[meta_columns].to_dict(orient='records')

v = DictVectorizer(sparse=True)
X1 = v.fit_transform(meta_records)

In [92]:
from scipy.sparse import hstack

# Column-wise concatenation: metadata features first, then the TF-IDF block.
# X is overwritten, so re-running this cell alone stacks X1 on again.
X = hstack((X1, X))#.toarray()

In [93]:
from sklearn.preprocessing import StandardScaler

# with_mean=False: only the variance is scaled, no centring is applied.
# The scaler is fitted on every row of X.
scaler = StandardScaler(with_mean=False)
scaler.fit(X)
X = scaler.transform(X)

In [47]:
np.random.seed(0)

# Positional split: True for the first 60% of the (already shuffled) rows.
n_rows = len(Z)
index = np.arange(n_rows)
mask = index < 0.6 * n_rows

In [48]:
# CSR format supports row selection with a boolean mask; convert once and reuse.
X_rows = X.tocsr()

Xtr, Xte = X_rows[mask], X_rows[~mask]
Ytr, Yte = Y[mask], Y[~mask]

In [94]:
# Rows with a target are used for training; rows whose target is NaN are the
# ones to predict. np.where returns a 1-tuple of index arrays.
missing_target = np.isnan(Z.y)

train_inds = np.where(~missing_target)
test_inds = np.where(missing_target)

del missing_target

In [None]:
# LogisticRegressionCV was used below without being imported, which raises
# NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# One-vs-rest logistic regression with built-in cross-validation, fitted on
# the rows that have a target.
# clf = LinearSVC(C=1)
clf = LogisticRegressionCV(multi_class='ovr')
clf.fit(X[train_inds], Y[train_inds])
# clf.fit(Xtr,Ytr)

In [None]:
# GridSearchCV exposes the selected parameters as best_params_;
# LogisticRegressionCV exposes the selected regularisation strength(s) as C_.
# Show whichever the current `clf` provides instead of raising AttributeError.
clf.best_params_ if hasattr(clf, 'best_params_') else {'C': clf.C_}

In [None]:
# Predict the hold-out rows (Xte is the ~mask part of X).
Y_hat = clf.predict(Xte)

In [None]:
# Inspect the training targets of the mask-based split.
Ytr

In [None]:
# Inspect the hold-out targets of the mask-based split.
Yte

In [None]:
def rmse(predictions, targets):
    """Root-mean-square error between two equally shaped sequences.

    Parameters
    ----------
    predictions, targets : array-like
        Numeric values; lists, tuples, pandas Series and numpy arrays are
        all accepted and converted to float arrays.

    Returns
    -------
    float
        ``sqrt(mean((predictions - targets) ** 2))``. The result is NaN if
        either input contains NaN.
    """
    # Converting to float arrays lets plain Python lists work and keeps the
    # squaring out of integer arithmetic.
    errors = np.asarray(predictions, dtype=float) - np.asarray(targets, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

# Hold-out error of the current predictions.
# NOTE(review): Yte comes from the positional mask split, which does not
# exclude rows whose target is NaN; if any are present the result is NaN.
rmse(Y_hat, Yte)

In [None]:
#output
# Predict the rows without a target and pair each prediction with its review id.
Y_hat = clf.predict(X[test_inds])
content = pd.DataFrame({
    'ReviewId': Z.id.values[test_inds],
    'Score': Y_hat,
})

# Both columns are cast to int before the submission file is written.
solution = pd.DataFrame(content, dtype=int)
solution.to_csv(main_path + '/solution.csv',index=False)

In [None]:
# Grid search over the LinearSVC regularisation strength C.
param_grid = {'C': [0.1, 0.03, 1, 3, 5, 7]}
clf = sklearn.model_selection.GridSearchCV(sklearn.svm.LinearSVC(),param_grid, n_jobs=-1)

# Fit on the labelled rows only: rows selected by test_inds have a NaN target
# and cannot be passed to fit().
clf.fit(X[train_inds], Y[train_inds])