In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [55]:
# listings = pd.read_csv('../data/cities/listings_Antwerp.csv.gz', compression='gzip')
reviews = pd.read_csv('../data/cities/reviews_Rotterdam_en.csv.gz', compression='gzip')

Ground_truth: review_scores_rating

## Preparing llm

In [56]:
model_name = "sohan-ai/sentiment-analysis-model-amazon-reviews"
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(model_name)
model.to('cuda')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [57]:
def add_sentiment_features(row):
    global count
    count += 1
    if count % 1000 == 0:
        print(f'row: {count} | index: {row.index}')
    #if count > 14500:
    #    if count % 1 == 0:
    #        print(f'row: {count} | index: {row.index}')
    text = row['comments']
    inputs = tokenizer(text, return_tensors="pt").to('cuda')
    
    outputs = model(**inputs)
    logits = outputs.logits.detach().cpu().numpy()
    return logits[0][0], logits[0][1]

In [58]:
count = 0


reviews['comments'] = reviews['comments'].astype(str)
reviews['comments'] = reviews['comments'].apply(lambda x: x[:512] if len(x) > 512 else x)
reviews['feature_1'], reviews['feature_2'] = zip(*reviews.apply(add_sentiment_features, axis=1))

row: 1000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 2000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 3000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 4000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 5000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 6000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 7000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 8000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
row: 9000 | index: Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_n

In [59]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,feature_1,feature_2
0,73155,286470,2011-05-29,516906,Daniela,Kim is a great host full of attention and care...,-2.475608,2.485249
1,73155,337681,2011-06-27,570117,Lillian,"The apartment is really nice, extremely clean ...",-2.488816,2.508798
2,73155,358679,2011-07-07,129200,Andre And Rossana,Kim is a wonderful host with a beautiful apart...,-2.045111,2.027211
3,73155,364109,2011-07-10,660393,Denise,Kim's place is terrific. The apartment is very...,-2.849687,2.898854
4,73155,368273,2011-07-12,232321,Evert,Most central location you could wish for. Apar...,-2.767707,2.808869


In [68]:
X = reviews.drop(columns=['listing_id','id','date','reviewer_id','reviewer_name','comments'])

KeyError: "['xgb_predict'] not found in axis"

In [73]:
X = X.drop(columns=['xgb_predict'])

In [69]:
X

Unnamed: 0,feature_1,feature_2,xgb_predict
0,-2.475608,2.485249,4
1,-2.488816,2.508798,4
2,-2.045111,2.027211,4
3,-2.849687,2.898854,4
4,-2.767707,2.808869,4
...,...,...,...
26983,-3.141326,3.146073,4
26984,-2.100139,2.079285,4
26985,-2.378335,2.414392,4
26986,-2.738594,2.780065,4


In [74]:
with open('XGBoost_kaggle_data_classifier.pkl', 'rb') as file:
    xgb_loaded = pickle.load(file)

# Use the loaded model to make predictions
reviews['xgb_predict'] = xgb_loaded.predict(X) + 1

In [75]:
with open('random_forest_kaggle_data_classifier.pkl', 'rb') as file:
    rf_loaded = pickle.load(file)

# Use the loaded model to make predictions
reviews['rf_predict'] = rf_loaded.predict(X) + 1

In [79]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,feature_1,feature_2,xgb_predict,rf_predict
0,73155,286470,2011-05-29,516906,Daniela,Kim is a great host full of attention and care...,-2.475608,2.485249,5,5
1,73155,337681,2011-06-27,570117,Lillian,"The apartment is really nice, extremely clean ...",-2.488816,2.508798,5,5
2,73155,358679,2011-07-07,129200,Andre And Rossana,Kim is a wonderful host with a beautiful apart...,-2.045111,2.027211,5,5
3,73155,364109,2011-07-10,660393,Denise,Kim's place is terrific. The apartment is very...,-2.849687,2.898854,5,5
4,73155,368273,2011-07-12,232321,Evert,Most central location you could wish for. Apar...,-2.767707,2.808869,5,5
