-
Notifications
You must be signed in to change notification settings - Fork 0
/
question2.py
87 lines (68 loc) · 2.31 KB
/
question2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
import numpy as np
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
import dill
class Estimator(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
    """Baseline regressor that predicts the mean ``stars`` of a record's city.

    ``averageByCity`` holds the lookup used by ``predict``: an empty dict
    until ``fit`` succeeds, and afterwards the result of grouping the
    training frame by ``city`` and averaging its ``stars`` column.
    """

    def __init__(self):
        self.averageByCity = {}

    def fit(self, df):
        """Compute the per-city mean of ``stars`` from ``df``.

        ``df`` is grouped by its ``city`` column and the ``stars`` column of
        each group is averaged.  If that computation fails for any reason the
        lookup is reset to an empty dict, so later predictions fall back to 0.

        Returns ``self`` so calls can be chained.
        """
        try:
            self.averageByCity = df.groupby(by=['city'])['stars'].mean()
        except Exception:
            # Deliberate best-effort: a frame that cannot be aggregated leaves
            # the model empty instead of raising.  ``Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt / SystemExit propagate.
            self.averageByCity = {}
        return self

    def predict(self, X):
        """Return the stored average for ``X['city']``, or 0 if the lookup fails.

        ``X`` is indexed with the key ``'city'``; the resulting value is then
        looked up in ``averageByCity``.
        """
        try:
            return self.averageByCity[X['city']]
        except Exception:
            # Record without a 'city' key, or a city with no stored average:
            # fall back to 0 rather than raising.
            return 0
def city_model(record):
    """Train the per-city average model, persist it, and score ``record``.

    On every call this reads the pipe-separated file ``./city.txt``, fits an
    ``Estimator`` on it, and serialises the fitted estimator with ``dill``
    to the file ``city_model`` in the current directory.

    Returns the estimator's prediction for ``record`` converted to ``float``.
    """
    df = pd.read_csv("./city.txt", sep="|", low_memory=False)
    estimator = Estimator()
    estimator.fit(df)
    # ``with`` closes the handle even if dill.dump raises part-way through.
    with open("city_model", "wb") as f:
        dill.dump(estimator, f)
    return float(estimator.predict(record))
# Second approach: predict from longitude and latitude using nearest neighbours.
class kEstimator(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
    """Wrapper around ``KNeighborsRegressor(n_neighbors=5)``.

    ``fit`` delegates directly to the wrapped regressor; ``predict`` does the
    same but returns 0 instead of raising when the wrapped call fails.
    """

    def __init__(self):
        self.neigh = KNeighborsRegressor(n_neighbors=5)

    def fit(self, X, y):
        """Fit the wrapped regressor on ``X`` and ``y`` and return ``self``."""
        self.neigh.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped regressor's prediction for ``X``, or 0 on failure."""
        try:
            return self.neigh.predict(X)
        except Exception:
            # Deliberate best-effort fallback.  ``Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt / SystemExit propagate.
            return 0
def lat_long_model(record):
    """Train the latitude/longitude neighbour model, persist it, score ``record``.

    On every call this reads the pipe-separated file ``./location.txt``, fits
    a ``kEstimator`` on its ``longitude`` / ``latitude`` columns against its
    ``stars`` column, and serialises the fitted estimator with ``dill`` to
    the file ``lat_long_model`` in the current directory.

    ``record`` is indexed with the keys ``'longitude'`` and ``'latitude'``.

    Returns the prediction for ``record`` as a ``float``.
    """
    df = pd.read_csv("./location.txt", sep="|", low_memory=False)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    Xsubset_np = df[['longitude', 'latitude']].values
    # Kept two-dimensional (n, 1) so the persisted model is fitted exactly
    # as before.
    Ysubset_np = df[['stars']].values
    # One query row of two features: shape (1, 2).  A flat 1-D array is
    # rejected by current scikit-learn, and the estimator's fallback would
    # turn that error into a silent 0.
    test = np.array([[record['longitude'], record['latitude']]])
    q2 = kEstimator()
    q2.fit(Xsubset_np, Ysubset_np)
    # ``with`` closes the handle even if dill.dump raises part-way through.
    with open("lat_long_model", "wb") as f:
        dill.dump(q2, f)
    # ravel() flattens both the (1, 1) prediction array and the scalar 0
    # fallback, so taking element 0 yields a plain number in either case.
    return float(np.ravel(q2.predict(test))[0])
#data=pd.read_csv ("./location.txt", sep="|",low_memory=False)
#Xsubset=data[['longitude','latitude']]
#Xsubset_np=Xsubset.as_matrix()
#Ysubset=data[['stars']]
#Ysubset_np=Ysubset.as_matrix()
# Ad-hoc check: score one hard-coded location with the lat/long model.
# NOTE: this runs at import time, reading ./location.txt and writing the
# lat_long_model file as a side effect.
X = {}
X['longitude'] = -90
X['latitude'] = 45
# Parenthesised so the line is valid on both Python 2 and Python 3; with a
# single argument the printed output is the same on either.
print(lat_long_model(X))
#city_model(X)