In [None]:
# Third-party imports for the whole notebook, grouped in one place.
import lightgbm as lgb
import nltk
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Mount Google Drive so the files under /content/drive are readable.
drive.mount('/content/drive')

Mounted at /content/drive


# 1. Load data

In [None]:
# Read the combined review file, keep only the Bumble rows, and tidy the columns.
path = '/content/drive/MyDrive/DSO 560 NLP Team Project/'
df_1 = pd.read_csv(f'{path}clean_data/bumble_hinge_review.csv')

is_bumble = df_1['App'] == 'Bumble'
df = (
    df_1[is_bumble]
    .dropna()        # discard rows with any missing value
    .reset_index()   # the old row labels become an 'index' column ...
    .rename(columns={'Review': 'text', 'App': 'app', 'Rating': 'score', 'index': 'id'})  # ... renamed to 'id'
)
df = df[['id', 'text', 'score', 'app']]

# Hold out 30% of the rows for testing, stratified on the score so both
# splits keep the same score distribution; the seed is fixed for repeatability.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
    stratify=df['score'],
)

# 2. TF-IDF Vectorization

In [None]:
# Fetch NLTK's English stop-word list and configure the TF-IDF vectorizer.
nltk.download('stopwords')
en_stops = nltk.corpus.stopwords.words('english')

tfidf_options = dict(
    ngram_range=(1, 3),                 # unigrams, bigrams and trigrams
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # tokens are runs of three or more ASCII letters
    max_df=0.2,                         # ignore terms found in more than 20% of documents
    max_features=200,                   # cap the vocabulary at 200 terms
    stop_words=en_stops,
)
vectorizer = TfidfVectorizer(**tfidf_options)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Generate the TF-IDF matrices for the training and testing reviews.
train_corpus = train_df['text'].tolist()
test_corpus = test_df['text'].tolist()

# Learn the vocabulary and IDF weights from the training reviews ONLY, then
# reuse that fitted vectorizer for the test reviews. Calling fit_transform on
# the test corpus would refit the vocabulary, so the test columns would no
# longer correspond to the features the model is trained on (and test-set
# information would leak into the features).
X_train = vectorizer.fit_transform(train_corpus)
X_test = vectorizer.transform(test_corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older releases. list() keeps `terms` a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

# Term-by-document views (one row per term) of the same matrices, for inspection.
tf_idf_train = pd.DataFrame(X_train.toarray().transpose(), index=terms)
tf_idf_test = pd.DataFrame(X_test.toarray().transpose(), index=terms)



In [None]:
# Cast both feature matrices to the 'float' dtype before modelling.
X_train, X_test = (matrix.astype('float') for matrix in (X_train, X_test))

# 3. LightGBM Model

In [None]:
# Regression targets: the 'score' column of each split.
y_train = train_df['score'].values
y_test = test_df['score'].values

# Gradient-boosted regressor: 3000 trees of depth <= 4, seeded for repeatability.
lgbm_params = {'n_estimators': 3000, 'max_depth': 4, 'random_state': 0}
model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the training features; the fitted estimator is the cell's output.
model.fit(X_train, y_train)

LGBMRegressor(max_depth=4, n_estimators=3000, random_state=0)

In [None]:
# Predict on the held-out features and report HALF the mean squared error
# (0.5 * MSE). Note the 0.5 factor: the plain MSE is twice this value.
y_pred = model.predict(X_test)
residuals = y_pred - y_test
0.5 * np.mean(residuals ** 2)

1.0432650472527287

In [None]:
# Report the R^2 returned by model.score on the TRAINING data (in-sample fit).
# NOTE(review): for a held-out figure, score (X_test, y_test) instead.
r2_train = model.score(X_train, y_train)
r2_train

0.5967744975742327