### Imports

In [None]:
import pandas as pd
import matplotlib
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Fetch the NLTK resources this notebook relies on. 'stopwords' backs the
# English stop-word list handed to the TF-IDF vectorizer further down.
# NOTE(review): 'punkt' is not used by any cell visible here - presumably kept
# for the planned tokenization work; confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

### Read data

In [None]:
# Load the business table from a line-delimited JSON file (one record per line).
business = pd.read_json('business.json', lines=True)

In [None]:
# Because `chunksize` is set, this is a reader that yields DataFrames of up to
# 100000 rows each rather than a single DataFrame. It is iterated in the
# merge cell below.
review = pd.read_json('review.json', lines=True, chunksize=100000)

In [None]:
# Bar chart of the ten cities with the most businesses.
business['city'].value_counts().head(10).plot.bar()

In [None]:
# Restrict the analysis to businesses located in Pittsburgh.
in_pittsburgh = business['city'] == 'Pittsburgh'
city_business = business.loc[in_pittsburgh]
# Single-column frame of business ids, used as the join key for the reviews.
city_business_ids = city_business['business_id'].to_frame()

### Merge Data

In [None]:
# Join each review chunk against the selected business ids and stack the
# results. The per-review 'stars' column is dropped here; the business-level
# rating is joined in later from the business table.
#
# An inner join keeps only reviews that belong to a selected business. A left
# join would additionally emit an all-NaN placeholder row, in every chunk, for
# each business with no review in that chunk - rows that are only thrown away
# again by the null-text filter further down.
temp = [
    city_business_ids.merge(chunk.drop(columns='stars'), on='business_id', how='inner')
    for chunk in review
]
if not temp:
    # The chunked reader can only be iterated once, so re-running this cell
    # without re-creating `review` yields no chunks at all.
    raise ValueError(
        "No review chunks were read; re-run the cell that creates `review` "
        "before re-running this merge."
    )
merged = pd.concat(temp)

### Format data matrix
The goal of the model is to predict a business's star rating from its review text.
We want a matrix that holds the business stars together with the text of all of its reviews.
We use nltk for the modeling and sentiment analysis.

In [None]:
# Keep only the join key and the review text: discard the review metadata
# columns, remove rows that carry no text, and renumber the rows from zero.
merged = merged.drop(
    columns=['user_id', 'review_id', 'useful', 'funny', 'cool', 'date']
)
has_text = merged['text'].notnull()
merged = merged.loc[has_text].reset_index(drop=True)

In [None]:
# Sanity check: column names, dtypes and non-null counts after the cleanup.
merged.info()

### Build word buckets

In [None]:
# Collect all review text of a business into a single document per business.
# The reviews are joined with a space so that the last word of one review and
# the first word of the next stay separate tokens; plain string summation
# would glue them together (e.g. "...was goodThe staff...").
df_buckets = (
    merged.groupby('business_id')['text']
    .agg(' '.join)
    .to_frame()
)

In [None]:
# Attach each business's star rating (from the business table) to its text bucket.
df_buckets = df_buckets.merge(
    city_business[['business_id', 'stars']],
    how='left',
    on='business_id',
)

In [None]:
# Preview the first rows: one row per business with its combined review text and stars.
df_buckets.head()

* df_buckets contains all review text for each business as well as business average rating
* review text is the X matrix
* stars is the y vector

This is where we would remove punctuation, do stemming / lemmatization, and apply other text processing to make the model better.

We're going to skip that for now and come back to it after the first deadline.

### Create and train model

In [None]:
# English stop words from NLTK are excluded from the vocabulary.
stop = nltk.corpus.stopwords.words('english')

# TF-IDF features: at most 2500 terms, each appearing in at least 7 documents
# and in no more than 80% of them.
vectorizer = TfidfVectorizer(
    stop_words=stop,
    max_features=2500,
    min_df=7,
    max_df=0.8,
)

# Fit the vocabulary on the per-business text and build the feature matrix;
# the trailing expression displays a summary of the resulting matrix.
X = vectorizer.fit_transform(df_buckets['text'])
X

In [None]:
# Hold out 20% of the businesses for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_buckets['stars'],
    test_size=0.2,
    random_state=0,
)

# Fit a 5-nearest-neighbour regressor on the TF-IDF features and predict the
# star rating of the held-out businesses.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
predictions = knn.predict(X_test)

In [None]:
# The model is a KNeighborsRegressor, so its predictions are continuous values.
# Classification metrics (confusion matrix, classification report, accuracy)
# do not apply to such output, so regression metrics are reported instead:
#   r2                  - coefficient of determination on the held-out set
#   mean_absolute_error - average absolute gap between predicted and true stars
pd.Series({
    'r2': knn.score(X_test, y_test),
    'mean_absolute_error': (y_test - predictions).abs().mean(),
})