In [2]:
from surprise import SVD, SVDpp
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [3]:
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
data = Dataset.load_from_file('data.csv', reader=reader)

## SVD

In [11]:
algo = SVD(n_epochs = 30, lr_all = 0.005, n_factors = 50)
train, test = train_test_split(data, test_size=.2)

In [12]:
algo.fit(train)
predictions = algo.test(test)

In [13]:
accuracy.rmse(predictions, verbose=True)

RMSE: 0.6950


0.6949863524353042

In [6]:
algo2 = SVDpp(n_epochs = 30, lr_all = 0.005, n_factors = 50)

In [7]:
algo2.fit(train)
predictions2 = algo2.test(test)

In [8]:
accuracy.rmse(predictions2, verbose=True)

RMSE: 0.6974


0.6973847644623044

In [15]:
user = []
movie = []
RealRate = []
EstimatedSVD = []
EstimatedSVDpp = []
for userID, movieID, actualRating, estimatedRating, _ in predictions:
    user.append(int(userID))
    movie.append(int(movieID))
    RealRate.append(actualRating)
    EstimatedSVD.append(estimatedRating)
for _,_,_,estimatedRating,_ in predictions2:
    EstimatedSVDpp.append(estimatedRating)

In [53]:
import pandas as pd
predicted = pd.DataFrame()
predicted['userId'] = user
predicted['movieId'] = movie
predicted['RealRate'] = RealRate
predicted['EstimatedSVD'] = EstimatedSVD
predicted['EstimatedSVDpp'] = EstimatedSVDpp
predicted['RealRate'] = predicted['RealRate'].astype(int)
predicted['Combination'] = predicted['EstimatedSVD'] * 0.96771722 - predicted['EstimatedSVDpp'] * 0.01791678 + 0.10381963515239212
predicted.head()

Unnamed: 0,userId,movieId,RealRate,EstimatedSVD,EstimatedSVDpp,Combination
0,554,18,2,2.207811,2.134832,2.202107
1,336,29,3,2.645088,1.80636,2.631153
2,112,158,3,2.065156,3.089469,2.046953
3,561,123,1,2.068532,2.517432,2.06047
4,445,138,3,2.23282,2.538789,2.219071


In [54]:
#from sklearn import preprocessing
#x = predicted[['Estimated']].values.astype(float)
# Create a minimum and maximum processor object
#min_max_scaler = preprocessing.MinMaxScaler(feature_range=(1,3))
#predicted['Normalized'] = min_max_scaler.fit_transform(x).astype(int)
predicted['EstimatedSVD'] = predicted['EstimatedSVD'].round().astype(int)
predicted['EstimatedSVDpp'] = predicted['EstimatedSVDpp'].round().astype(int)
predicted['Combination'] = predicted['Combination'].round().astype(int)
predicted

Unnamed: 0,userId,movieId,RealRate,EstimatedSVD,EstimatedSVDpp,Combination
0,554,18,2,2,2,2
1,336,29,3,3,2,3
2,112,158,3,2,3,2
3,561,123,1,2,3,2
4,445,138,3,2,3,2
...,...,...,...,...,...,...
14387,409,131,2,2,2,2
14388,256,54,3,2,3,2
14389,199,82,1,2,3,2
14390,149,39,3,2,2,2


## Linear Regression

In [47]:
from sklearn.linear_model import LinearRegression

In [49]:
reg = LinearRegression()
x = predicted[['EstimatedSVD', 'EstimatedSVDpp']]
y = predicted['RealRate']

In [50]:
reg.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [51]:
reg.coef_

array([ 0.96771722, -0.01791678])

In [52]:
reg.intercept_

0.10381963515239212

## Evaluate Answer

In [55]:
(predicted['EstimatedSVDpp'] - predicted['RealRate']).map(lambda x: (x == 0)).sum()

4713

In [56]:
(predicted['EstimatedSVD'] - predicted['RealRate']).map(lambda x: (x == 0)).sum()

7053

In [57]:
(predicted['Combination'] - predicted['RealRate']).map(lambda x: (x == 0)).sum()

6956

In [58]:
predicted['RealRate'].value_counts()

3    6726
2    3952
1    3714
Name: RealRate, dtype: int64

In [59]:
def check(x, y):
    if x == y:
        return 2
    elif (x, y) == (2, 3) or (x, y) == (3, 2):
        return 1
    else:
        return 0

In [60]:
sum = 0
for i in range(len(predicted.index)):
    sum += check(predicted.loc[predicted.index[i], 'EstimatedSVD'], predicted.loc[predicted.index[i], 'RealRate'])
sum

18205

In [41]:
sum / 2 / len(predicted.index)

0.6324694274596998

In [42]:
15979 / 2 / 14394 

0.5550576629151035

In [43]:
sum2 = 0
for i in range(len(predicted.index)):
    sum2 += check(predicted.loc[predicted.index[i], 'EstimatedSVDpp'], predicted.loc[predicted.index[i], 'RealRate'])
sum2

14779

In [44]:
sum2 / 2 / len(predicted.index)

0.5134449694274597

In [45]:
14325 / 2 / 14394 

0.4976031679866611

In [61]:
sum4 = 0
for i in range(len(predicted.index)):
    sum4 += check(predicted.loc[predicted.index[i], 'Combination'], predicted.loc[predicted.index[i], 'RealRate'])
sum4

18094

In [32]:
sum4 / 2 / len(predicted.index)

0.5765008337965536

In [27]:
predicted['Random'] = 2
sum3 = 0
for i in range(len(predicted.index)):
    sum3 += check(predicted.loc[predicted.index[i], 'Random'], predicted.loc[predicted.index[i], 'RealRate'])
sum3

14630

In [18]:
predicted['RealRate'].value_counts()

3    6832
2    3958
1    3602
Name: RealRate, dtype: int64

In [19]:
16050/14412  /2

0.5568276436303081

## Read the test file + predict the answer

In [92]:
import pandas as pd

In [93]:
submission = pd.read_csv('test.csv')
submission.head()

Unnamed: 0,reviewerid,movie-code
0,45,57
1,235,185
2,507,106
3,170,36
4,469,115


In [94]:
algo.predict(str(58), str(164), verbose=True)

user: 58         item: 164        r_ui = None   est = 3.05   {'was_impossible': False}


Prediction(uid='58', iid='164', r_ui=None, est=3.0498376194380716, details={'was_impossible': False})

In [103]:
ans = []
for i in range(len(submission.index)):
    userid = submission.loc[submission.index[i], 'reviewerid']
    movieid = submission.loc[submission.index[i], 'movie-code']
    rating = algo.predict(str(userid), str(movieid))[3] * 0.5 + algo2.predict(str(userid), str(movieid))[3] * 0.5    
    ans.append(rating - 1)
submission['rating'] = ans
#submission['rating'] = submission['rating'].round().astype(int)

In [104]:
submission.loc[(submission['reviewerid'] == 45) & (submission['movie-code'] == 57)]
submission

Unnamed: 0,reviewerid,movie-code,rating
0,45,57,0.572631
1,235,185,0.674666
2,507,106,1.281019
3,170,36,1.113862
4,469,115,0.635782
...,...,...,...
9995,530,189,0.396089
9996,221,156,0.820626
9997,233,14,1.276174
9998,465,60,1.623678


In [105]:
from sklearn import preprocessing
x = submission[['rating']].values.astype(float)
# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,2))
submission['rating'] = min_max_scaler.fit_transform(x)

In [106]:
submission['rating'] = submission['rating'].round().astype(int)

In [108]:
submission

Unnamed: 0,reviewerid,movie-code,rating
0,45,57,0
1,235,185,1
2,507,106,1
3,170,36,1
4,469,115,1
...,...,...,...
9995,530,189,0
9996,221,156,1
9997,233,14,1
9998,465,60,1


In [109]:
submission['rating'].value_counts()

1    7903
0    1845
2     252
Name: rating, dtype: int64

In [74]:
submission['rating'].value_counts()

1    7817
0    1244
2     939
Name: rating, dtype: int64

In [110]:
submission.to_csv('submission_test.csv', index = False)