In [2]:
import pandas as pd

In [3]:
df_ngram = pd.read_csv("ngramApology.csv")

In [4]:
df_label = pd.read_csv("existence_label.csv")

In [5]:
df_dv = pd.DataFrame(df_label, columns = ["Label", "rating"])

In [7]:
df_merge = pd.concat([df_dv, df_ngram], axis = 1)

In [8]:
df_top_pos = df_merge[df_merge["Label"] == 1]
df_top_neg = df_merge[df_merge["Label"] == 0]

In [9]:
from sklearn.utils import resample
neg_upsample = resample(df_top_neg,
             replace=True,
             n_samples=len(df_top_pos),
             random_state=42)

print(neg_upsample.shape)

(519, 2242)


In [10]:
data_upsampled = pd.concat([df_top_pos, neg_upsample])

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [16]:
data_upsampled.columns[2:].to_list()

['you_for_choos',
 'that_you_found',
 'aspect_of_your',
 'of_your_stay',
 'stay_we_appreci',
 'we_appreci_you',
 'we_have_the',
 'for_your_recent',
 'your_experi_with',
 'experi_with_us',
 'that_we_can',
 'of_our_hotel',
 'that_you_were',
 'that_you_have',
 'of_our_guest',
 'delight_to_hear',
 'hear_that_you',
 'that_you_enjoy',
 'appreci_your_feedback',
 'your_feedback_regard',
 'us_and_we',
 'and_we_hope',
 'you_with_a',
 'experi_at_our',
 'regret_that_we',
 'we_appreci_your',
 'allow_us_to',
 'hope_to_have',
 'to_have_the',
 'have_the_opportun',
 'opportun_to_serv',
 'much_for_take',
 'your_feedback_i',
 'of_your_expect',
 'the_experi_you',
 'onc_again_i',
 'i_apolog_for',
 'your_experi_and',
 'you_in_the',
 'you_for_your',
 'for_your_feedback',
 'you_for_the',
 'comment_regard_your',
 'regard_your_recent',
 'your_recent_stay',
 'recent_stay_at',
 'stay_at_our',
 'at_our_properti',
 'hope_to_see',
 'we_appreci_the',
 'sorri_you_were',
 'you_were_not',
 'we_do_hope',
 'much_for_your'

In [17]:
X_data = data_upsampled[data_upsampled.columns[2:].to_list()]
y_data = data_upsampled['Label']
X_train, X_test, y_train, y_test = train_test_split(X_data,
                            y_data, stratify=y_data, #keep the same ratio
                            test_size=0.1,
                            random_state=42)

In [18]:
lgclf = LogisticRegression(random_state=42, fit_intercept = True, solver = "liblinear", penalty ="l1").fit(X_train, y_train) 

In [22]:
print(cross_val_score(lgclf, X_train, y_train, cv=10))

[0.94680851 0.96808511 0.94680851 0.94680851 0.94623656 0.96774194
 0.95698925 0.93548387 0.93548387 0.98924731]


In [20]:
print("Accuracy Score:",lgclf.score(X_test, y_test))

Accuracy Score: 0.9326923076923077


In [23]:
lgclf.predict(X_test)

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0])

In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, lgclf.predict(X_test))

array([[51,  1],
       [ 6, 46]])

In [25]:
coeff_df = pd.DataFrame(X_data.columns)
coeff_df.columns = ['Features']
coeff_df["Coefficient"] = pd.Series(lgclf.coef_[0])

# preview
coeff_df.sort_values(by='Coefficient', inplace=True)
df_f0 = coeff_df[coeff_df["Coefficient"] == 0]
df_feature = coeff_df[coeff_df["Coefficient"] != 0]

In [26]:
df_feature

Unnamed: 0,Features,Coefficient
2236,condit,-1.623145
1828,one,-1.482151
2195,monica,-1.149840
2194,santa,-1.129585
1881,unfortun,-1.058204
...,...,...
1857,inconveni,1.238801
556,with_our,1.749713
1262,we_regret,2.703446
1704,apolog,7.402366


In [27]:
df_f0 

Unnamed: 0,Features,Coefficient
1457,best_regard,0.0
1447,more_about,0.0
1448,from_occur,0.0
1449,occur_in,0.0
1450,you_mention,0.0
...,...,...
741,take_a,0.0
742,will_make,0.0
728,as_the,0.0
738,our_room,0.0
