In [1]:
# Mount Google Drive into the Colab runtime so the project files can be read.
from google.colab import drive
drive.mount('/content/drive')
# Base folder on the mounted drive; later cells build the CSV file names from it.
path = "/content/drive/My Drive/University/ΕΞΑΜΗΝΟ 7/ΑΝΑΚΤΗΣΗ ΠΛΗΡΟΦΟΡΙΑΣ/Project/data"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load the google.colab.data_table IPython extension
# (presumably for interactive DataFrame rendering in Colab — confirm it is still wanted).
%load_ext google.colab.data_table

In [3]:
!pip install elasticsearch



In [4]:
import numpy as np
import pandas as pd
import re
from elasticsearch import Elasticsearch


pd.options.mode.chained_assignment = None

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.preprocessing import LabelEncoder


In [5]:
# Movie and rating tables stored under `path`.
movies = pd.read_csv("{}/movies.csv".format(path))
reviews = pd.read_csv("{}/ratings.csv".format(path))
# temp.csv is read from the current working directory, not from `path`;
# the file name contains no placeholder, so no formatting is applied to it.
tf_idf_pre = pd.read_csv("./temp.csv")

# Title without the trailing "(YYYY)" release year. Only an actual year suffix
# is removed, so titles that carry no year are left intact instead of losing
# their last six characters, and missing titles stay missing instead of raising.
movies["title_m"] = movies["title"].str.replace(r"\(\d{4}\)\s*$", "", regex=True)

# **TF-IDF**

In [6]:
# Client for the Elasticsearch node that is queried for the "movies_test" index below.
es_node = {'host': 'snf-874957.vm.okeanos.grnet.gr', 'port': 9200}
es = Elasticsearch([es_node])

In [7]:
def TfIdf(corpus, N):
    """Compute a TF-IDF weight for every (movie, word) pair in ``corpus``.

    The weight is ``term_freq * ln(N / doc_freq)``.

    Parameters
    ----------
    corpus : dict
        Nested mapping ``movieId -> {word -> {"term_freq": ..., "doc_freq": ...}}``.
    N : int
        Number of documents used as the numerator of the inverse document
        frequency.

    Returns
    -------
    pandas.DataFrame
        One row per (movieId, word) pair, sorted by that pair, with the columns
        ``movieId`` (int64), ``word`` and ``tf-idf`` (float32) and a default
        integer index. When ``corpus`` contains no words at all, an empty frame
        with the same three columns is returned.
    """
    # Flatten the nested mapping so that each (movieId, word) pair becomes one row.
    data = {
        (movie_id, word): stats
        for movie_id, words in corpus.items()
        for word, stats in words.items()
    }

    # Without any rows the frequency columns would not exist and the
    # computation below would fail, so return a correctly typed empty result.
    if not data:
        return pd.DataFrame({
            "movieId": pd.Series(dtype="int64"),
            "word": pd.Series(dtype="object"),
            "tf-idf": pd.Series(dtype="float32"),
        })

    df = pd.DataFrame.from_dict(data, orient='index').sort_index()
    df["tf-idf"] = df["term_freq"] * np.log(N / df["doc_freq"])
    df.index.names = ["movieId", "word"]

    # Only the final weight is kept; the raw frequencies are dropped.
    tfidf = df.reset_index().drop(columns=["term_freq", "doc_freq"])
    tfidf["movieId"] = tfidf["movieId"].astype("int64")
    tfidf["tf-idf"] = tfidf["tf-idf"].astype("float32")

    return tfidf

In [8]:
# Match-all query against the movie index; `size` caps the response at 10000 hits.
match_all_query = {"query": {"match_all": {}}}
res = es.search(index="movies_test", body=match_all_query, size=10000)

In [9]:
ids = dict()               # Elasticsearch document _id -> movieId
corpus = dict()            # filled by the term-vector cell
word_params_dict = dict()  # scratch dict used by the term-vector cell

# `hits.total` is a plain integer on older Elasticsearch servers and a
# {"value": ..., "relation": ...} object on 7.x and later; normalise it to an int.
reported_total = res['hits']['total']
total_hits = reported_total['value'] if isinstance(reported_total, dict) else reported_total

# Walk the hits that were actually returned. The reported total can be larger
# than the returned page (the search is capped by `size`), so indexing the hit
# list with range(total_hits) could run past its end.
for hit in res['hits']['hits']:
    ids[hit['_id']] = hit['_source']['movieId']

In [10]:
# Ask Elasticsearch for the term vectors (with term/field statistics) of the
# "title" field of every collected document.
term_vector_request = dict(
    ids=list(ids.keys()),
    parameters=dict(term_statistics=True, field_statistics=True, fields=["title"]),
)
term_vector_response = es.mtermvectors(index="movies_test", doc_type='_doc', body=term_vector_request)

for movie in term_vector_response['docs']:
    movieId = ids.get(movie['_id'], None)
    # A document may come back without a "title" term vector; look the terms up
    # defensively instead of raising KeyError.
    terms = movie.get('term_vectors', {}).get('title', {}).get('terms', {})

    # Skip documents that cannot be mapped to a movieId or that have no terms.
    if movieId is None or not terms:
        continue

    # Build a fresh per-movie dict, so the cell does not depend on a scratch
    # dict created (and reset) elsewhere.
    corpus[movieId] = {
        word: {'term_freq': stats['term_freq'], 'doc_freq': stats['doc_freq']}
        for word, stats in terms.items()
    }

In [11]:
# Use the number of movies in the corpus as the document count N.
# This rebinds `tf_idf_pre`, replacing the frame loaded from temp.csv.
n_documents = len(corpus)
tf_idf_pre = TfIdf(corpus, n_documents)

In [12]:
# Display the TF-IDF table (one row per movieId/word pair).
tf_idf_pre

Unnamed: 0,movieId,word,tf-idf
0,1,1995,5.007899
1,1,story,6.553824
2,1,toy,8.425626
3,2,1995,5.007899
4,2,jumanji,9.118773
...,...,...,...
38177,164977,the,2.677827
38178,164979,69,9.118773
38179,164979,of,3.877026
38180,164979,unboxed,9.118773


# **Data Processing**

In [13]:
# Split the pipe-separated genre string of each movie into one genre per row,
# indexed by movieId.
genre_parts = movies.set_index('movieId')['genres'].str.split('|', expand=True)
temp = genre_parts.stack()

# One indicator column per genre, collapsed back to a single row per movie.
genre_dummies = pd.get_dummies(temp)
genres = genre_dummies.groupby(level=0).sum().reset_index()

In [14]:
# Turn the rating into an integer class label by doubling it.
reviews['class'] = reviews['rating'].mul(2).astype('int64')

In [15]:
# Keep only the three TF-IDF columns; this discards anything else in the frame,
# such as a saved index column when it was loaded from CSV.
tf_idf_pre = tf_idf_pre.loc[:, ["movieId", "word", "tf-idf"]]

In [16]:
# Store the weights as float32.
tf_idf_pre = tf_idf_pre.astype({"tf-idf": "float32"})

In [17]:
# Reshape to one row per movie and one column per word; words that do not occur
# in a title get weight 0.
word_matrix = tf_idf_pre.pivot(index='movieId', columns='word', values='tf-idf')
tf_idf = word_matrix.reset_index().fillna(0)

In [18]:
# Feature table of every movie: genre indicators plus title TF-IDF weights.
library = pd.merge(genres, tf_idf, how='left', on='movieId')

In [19]:
# Attach the genre indicators of the rated movie to every review.
review_keys = reviews[['class', "userId", "movieId"]]
extra_data = pd.merge(review_keys, genres, how='left', on='movieId')

In [20]:
# Treat the rating class as a categorical label.
extra_data = extra_data.astype({"class": "category"})

In [21]:
# Add the title TF-IDF weights to every review. Column names present on both
# sides get pandas' default "_x"/"_y" suffixes, which is why the label column is
# addressed as "class_x" in the cells that follow.
final = pd.merge(extra_data, tf_idf, how='left', on='movieId')

In [22]:
# Report the size, dtypes and memory footprint of the merged training frame.
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100004 entries, 0 to 100003
Columns: 9184 entries, class_x to über
dtypes: category(1), float32(9161), int64(2), uint8(20)
memory usage: 3.4 GB


# Train Model

In [23]:
accs = []

# Columns that hold the label or identifiers rather than model inputs.
non_feature_columns = ["class_x", "movieId", "userId"]

# Train and evaluate one independent classifier per user.
for i in final.userId.unique():

    user_rows = final[final.userId == i]
    y = user_rows['class_x']
    # Build the feature matrix as a new frame instead of dropping columns
    # in place on a slice of `final`.
    X = user_rows.drop(columns=non_feature_columns)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

    # Seed the classifier as well: SGD shuffles the training data, so without a
    # seed the reported accuracies change from run to run.
    clf = SGDClassifier(max_iter=300, tol=1e-3, random_state=20)
    # Fit on the training split, then score on the held-out split.
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    accs.append(accuracy)

Accuracy: 0.0
Accuracy: 0.3125
Accuracy: 0.2727272727272727
Accuracy: 0.4634146341463415
Accuracy: 0.25
Accuracy: 0.3333333333333333
Accuracy: 0.4444444444444444
Accuracy: 0.20833333333333334
Accuracy: 0.3333333333333333
Accuracy: 0.5
Accuracy: 0.25
Accuracy: 0.23076923076923078
Accuracy: 0.36363636363636365
Accuracy: 0.5
Accuracy: 0.17058823529411765
Accuracy: 0.16666666666666666
Accuracy: 0.1232876712328767
Accuracy: 0.09090909090909091
Accuracy: 0.4823529411764706
Accuracy: 0.25
Accuracy: 0.45454545454545453
Accuracy: 0.11363636363636363
Accuracy: 0.0821917808219178
Accuracy: 0.2
Accuracy: 0.16666666666666666
Accuracy: 0.08571428571428572
Accuracy: 0.2
Accuracy: 0.7
Accuracy: 0.0
Accuracy: 0.3497536945812808
Accuracy: 0.2857142857142857
Accuracy: 0.2
Accuracy: 0.14285714285714285
Accuracy: 0.47368421052631576
Accuracy: 0.25
Accuracy: 0.42857142857142855
Accuracy: 0.42857142857142855
Accuracy: 0.43478260869565216
Accuracy: 0.23076923076923078
Accuracy: 0.4444444444444444
Accuracy: 0.

In [24]:
# Summary statistics of the per-user test accuracies collected in `accs`.
pd.DataFrame(accs).describe()

Unnamed: 0,0
count,671.0
mean,0.284188
std,0.179288
min,0.0
25%,0.166667
50%,0.25
75%,0.4
max,0.875


The accuracy is very low, but on average it is about three times that of a dummy (random) classifier. Even after reduction, the data are too large to train as a single model on our machine, so a separate classifier is trained for each user.

# **Let's Predict**

In [None]:
predictions = []

# Columns that hold the label or identifiers rather than model inputs.
non_feature_columns = ["class_x", "movieId", "userId"]
feature_columns = final.columns.drop(non_feature_columns)

# The movie feature matrix is the same for every user, so build it once.
library_features = library.drop(columns=["movieId"])

# `final` was built by a merge that suffixes overlapping column names (the label
# is "class_x", so the matching word column is "class_y"), while `library` keeps
# the bare word. The columns line up by position but not by name, and
# scikit-learn checks feature names at predict time. Verify the positional
# alignment, then give the library the training feature names.
misaligned = [
    (lib_col, feat_col)
    for lib_col, feat_col in zip(library_features.columns, feature_columns)
    if lib_col != feat_col and str(lib_col) + "_y" != feat_col
]
if len(library_features.columns) != len(feature_columns) or misaligned:
    raise ValueError(
        "library and training feature columns are not aligned: {}".format(misaligned[:5])
    )
library_features.columns = feature_columns

# Fit one classifier per user on all of that user's reviews and predict a class
# for every movie in the library.
for i in final.userId.unique():

    user_data = final[final.userId == i]
    y = user_data['class_x']
    # Build the feature matrix as a new frame instead of dropping columns
    # in place on a slice of `final`.
    X = user_data.drop(columns=non_feature_columns)

    # Seeded so that the exported predictions are reproducible.
    clf = SGDClassifier(max_iter=300, tol=1e-3, random_state=20)
    clf.fit(X, y)

    user_predictions = clf.predict(library_features)
    predictions.append(user_predictions.tolist())



In [None]:
# One row per entry of `predictions` (one per user) and one column per movieId of `library`.
q4 = pd.DataFrame(predictions, columns=library['movieId'])

In [None]:
# Label each prediction row with its user; this relies on the rows having been
# produced in the same order as the unique user ids of `final`.
q4["userId"] = final["userId"].unique()

In [None]:
# Write the prediction matrix to predictions.csv in the working directory, without the row index.
q4.to_csv("predictions.csv", index=False)