In [1]:
import sys
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_svmlight_file
from sklearn.datasets import dump_svmlight_file
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import SVC         
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
import matplotlib.pylab as plt
from sklearn.model_selection import GridSearchCV
from pandas import DataFrame

def loadTrainFile():
    tmp = np.loadtxt("train.csv", dtype=np.str, delimiter=",")
    userID = tmp[1:,0].astype(int)
    item1 = tmp[1:,1].astype(int)
    item2 = tmp[1:,2].astype(int)
    labels = tmp[1:,3].astype(int)
    return userID, item1, item2, labels
def loadTestFile():
    tmp = np.loadtxt("test.csv", dtype=np.str, delimiter=",")
    userID = tmp[1:,0].astype(int)
    item1 = tmp[1:,1].astype(int)
    item2 = tmp[1:,2].astype(int)
    return userID, item1, item2
def loadUserFile():
    tmp = np.loadtxt("users.csv", dtype=str, delimiter=",")
    return tmp[1:,1:]

def loadItemFile():
    tmp = np.loadtxt("items.csv", dtype=np.str, delimiter=",")
    return tmp[1:,1:]





In [2]:
########### Data ###########
user_dic = loadUserFile()
item_dic = loadItemFile()
userID, item1, item2, labels = loadTrainFile()

preference = np.zeros([len(user_dic),10], dtype=int)

for idx, label in enumerate(labels):
    if(label==0):
        preference[userID[idx]-1][item1[idx]-1]+=1
#         preference[userID[idx]-1][item2[idx]-1]-=1
    else:
#         preference[userID[idx]-1][item1[idx]-1]-=1
        preference[userID[idx]-1][item2[idx]-1]+=1

preference = preprocessing.scale(preference, axis=1, copy=False)
X_train = []
y_train = []
for i in range(len(user_dic)):
    for j in range(len(item_dic)):
        X_train.append(np.concatenate([user_dic[i].astype(float),item_dic[j].astype(float)]))
        y_train.append(preference[i][j])
X_train = np.array(X_train)
X_train = preprocessing.scale(X_train, axis=0, copy=False)
y_train = np.array(y_train).reshape(len(y_train))
print(X_train)
print(y_train)
########### Data ###########

[[ 0.72166713  1.47393266  0.97112381 ... -1.22474487 -1.28527737
   0.81649658]
 [ 0.72166713  1.47393266  0.97112381 ...  0.81649658  1.33773767
  -1.22474487]
 [ 0.72166713  1.47393266  0.97112381 ... -1.22474487  0.46339932
   0.81649658]
 ...
 [ 0.72166713 -0.34948919  0.97112381 ...  0.81649658 -1.28527737
  -1.22474487]
 [ 0.72166713 -0.34948919  0.97112381 ...  0.81649658 -0.41093902
   0.81649658]
 [ 0.72166713 -0.34948919  0.97112381 ...  0.81649658  0.46339932
   0.81649658]]
[-0.19324699 -0.83740361  1.73922289  0.45090964  1.09506626 -0.83740361
 -1.48156024 -0.83740361 -0.19324699  1.09506626  0.37907125 -0.16245911
  1.46213197 -0.70398947 -0.70398947 -0.70398947  2.00366234 -0.70398947
  0.37907125 -1.24551983 -0.60547036  1.25751537 -0.13972393 -0.60547036
 -1.0712168   0.3260225   2.18900823  0.3260225  -0.60547036 -1.0712168
  0.41758499 -0.77551498  2.20723495  0.41758499  0.41758499 -1.37206497
 -0.178965   -1.37206497  0.41758499 -0.178965   -0.22298824  0.5203059



In [3]:
########### Model ###########

# # Split data into training data and testing data
X_train_fold, X_test_fold, y_train_fold, y_test_fold = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

linearRegres = LinearRegression()
linearRegres.fit(X_train_fold,y_train_fold)

########## Validate ##########
hit = 0

X_train_item1 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item1-1].astype(float)], axis=1)
X_train_item2 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item2-1].astype(float)], axis=1)
X_train_item1 = preprocessing.scale(X_train_item1, axis=0, copy=False)
X_train_item2 = preprocessing.scale(X_train_item2, axis=0, copy=False)
user_preference_item1 = linearRegres.predict(X_train_item1)
user_preference_item2 = linearRegres.predict(X_train_item2)

for idx in range(userID.shape[0]):
    entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
    if(labels[idx]==0 and user_preference_item1[idx]>=user_preference_item2[idx]):
        hit+=1
    elif(labels[idx]==1 and user_preference_item1[idx]<user_preference_item2[idx]):
        hit+=1
print(userID.shape[0])

print("hit %: " , hit/userID.shape[0])



1364
hit %:  0.6407624633431085


In [4]:
########### Model ###########

# # Split data into training data and testing data
X_train_fold, X_test_fold, y_train_fold, y_test_fold = train_test_split(X_train, y_train, test_size=0.05, random_state=0)

ridge = Ridge(alpha = .1)
ridge.fit(X_train_fold,y_train_fold)
print(X_train_fold)
print(y_train_fold)
########## Validate ##########
hit = 0

X_train_item1 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item1-1].astype(float)], axis=1)
X_train_item2 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item2-1].astype(float)], axis=1)
X_train_item1 = preprocessing.scale(X_train_item1, axis=0, copy=False)
X_train_item2 = preprocessing.scale(X_train_item2, axis=0, copy=False)
# print(X_train_item1)
# print(X_train_item2)
user_preference_item1 = ridge.predict(X_train_item1)
user_preference_item2 = ridge.predict(X_train_item2)

for idx in range(userID.shape[0]):
    entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
    if(labels[idx]==0 and user_preference_item1[idx]>user_preference_item2[idx]):
        hit+=1
    elif(labels[idx]==1 and user_preference_item1[idx]<=user_preference_item2[idx]):
        hit+=1
print(userID.shape[0])

print("hit %: " , hit/userID.shape[0])

[[ 0.72166713  1.47393266  0.97112381 ...  0.81649658 -0.41093902
   0.81649658]
 [-3.21469902 -0.34948919 -2.78806513 ...  0.81649658  0.46339932
   0.81649658]
 [ 0.72166713  0.56222174 -0.90847066 ...  0.81649658 -0.41093902
  -1.22474487]
 ...
 [ 0.72166713 -0.34948919 -0.90847066 ...  0.81649658  0.46339932
   0.81649658]
 [ 0.72166713  0.56222174  0.97112381 ... -1.22474487  0.46339932
   0.81649658]
 [ 0.72166713  1.47393266  0.97112381 ...  0.81649658  0.46339932
   0.81649658]]
[-1.57142857 -1.34839972  2.14301554 -0.70398947 -1.28373482 -0.09805807
 -1.21052632 -0.12038585 -0.15789474 -0.83740361 -0.14285714  1.62746694
 -0.58079717  1.26360005 -0.72558924  1.90443316  0.97986371 -0.19324699
 -0.83740361 -1.32569796 -0.70398947 -0.72558924  1.25751537 -1.28373482
 -1.48156024  0.3907019  -0.96628239 -1.1783257  -1.37206497 -0.77551498
  1.01413498  2.06513862  0.80952381  1.28571429 -0.63599873  0.52941176
  0.36842105 -0.14285714  0.43328912 -1.37206497  0.45090964  0.390701

In [None]:
########## Test ##########
print("########## Start Test ##########")

userID, item1, item2 = loadTestFile()

X_test_item1 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item1-1].astype(float)], axis=1)
X_test_item2 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item2-1].astype(float)], axis=1)
user_preference_item1 = svm.predict(X_test_item1)
user_preference_item2 = svm.predict(X_test_item2)

test_output=[['User-Item1-Item2','Preference']]
for idx in range(userID.shape[0]):
    entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
    if(user_preference_item1[idx]>=user_preference_item2[idx]):
        value=0
    else:
        value=1
    test_output.append([entry,value])
print(test_output)
np.savetxt("output.csv", np.array(test_output, dtype=np.str), fmt='%s,%s', delimiter=",")
# print(X_test.shape)
# print(X_test.astype(float))
# for idx, label in enumerate(labels):
# #     print(user_preference[userID[idx]*item1[idx]-1])
# #     print(user_preference[userID[idx]*item2[idx]-1])
#     if(label==0 and (user_preference[userID[idx]*item1[idx]-1]>=user_preference[userID[idx]*item2[idx]-1])):
#         hit+=1
#     elif(label==1 and (user_preference[userID[idx]*item1[idx]-1]<=user_preference[userID[idx]*item2[idx]-1])):
#         hit+=1
        
# test_output=[['User-Item1-Item2','Preference']]
# for idx in range(pridict_output.shape[0]):
#     entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
#     value = pridict_output[idx]
#     test_output.append([entry,value])

# print(test_output)
# np.savetxt("output.csv", np.array(test_output, dtype=np.str), fmt='%s,%s', delimiter=",")