In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw reviews and reshape them into the two columns used below:
#   text  - the review body (renamed from 'review')
#   label - 1 where the original 'sentiment' is 'positive', 0 otherwise
# Built as a single chain so re-running the cell always starts from the CSV.
df = (
    pd.read_csv('IMDB_Dataset.csv')
    .assign(label=lambda raw: (raw['sentiment'] == 'positive').astype(int))
    .rename(columns={'review': 'text'})
    .drop(columns='sentiment')
)

In [None]:
# Hold out 20% of the rows for validation.
# The seed is fixed so that Restart & Run All reproduces the same split and,
# therefore, the same downstream metrics; without it every run reshuffles.
SEED = 42
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=SEED)
df.head()

In [None]:
# Materialise each split's columns as plain Python lists.
# Series.tolist() yields the same elements as iterating and appending one by
# one, without four hand-written loops.
train_texts = df_train['text'].tolist()
test_texts = df_valid['text'].tolist()

train_labels = df_train['label'].tolist()
test_labels = df_valid['label'].tolist()

In [None]:
mindf = .04
maxdf = .7
X_train = train_texts
vectorizer = TfidfVectorizer(min_df=mindf, max_df=maxdf, stop_words="english")
X_train = vectorizer.fit_transform(X_train)
X_train = StandardScaler().fit_transform(X_train.todense())

In [None]:
y_train = train_labels
lr = LogisticRegression().fit(X_train, y_train)

In [None]:
X_test = test_texts
vectorizer = TfidfVectorizer(min_df=mindf, max_df=maxdf, stop_words="english", max_features=len(lr.coef_[0]))
X_test = vectorizer.fit_transform(X_test)
X_test = StandardScaler().fit_transform(X_test.todense())X_test.shape, X_train.shape

In [None]:
X_test = vectorizer.fit_transform(df_valid['text'])
X_test = StandardScaler().fit_transform(X_test.todense())
y_test = df_valid['text']
#stops complaint about different features
#X_test = np.pad(X_test, (0, X_train.shape[1]-X_test.shape[1]))

In [None]:
len(lr.coef_[0])

In [None]:
X_test.shape, X_train.shape

In [None]:
# Confusion matrix of the logistic model on the first `subsample` validation rows.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions ConfusionMatrixDisplay.from_estimator is the replacement - confirm
# the pinned scikit-learn version before upgrading.
subsample = 5000
y_test = test_labels
plot_confusion_matrix(
    estimator=lr,
    X=X_test[:subsample],
    y_true=y_test[:subsample],
    values_format="",
)

In [None]:
#base filled in values
con_mat_df = pd.DataFrame(np.array([[1682, 836],
                                    [809, 1673]]),
                               columns=["negative", "positive"],
                               index=["negative", "positive"])
f,ax = plt.subplots(figsize=(5, 3))
sns.heatmap(con_mat_df, annot=True, vmax=2200, vmin=200, cmap=plt.cm.Blues,fmt='g')
plt.tight_layout()
ax.set_title('Logistic TFIDF model', fontsize=16)
ax.set_xlabel('Predicted', fontsize=14)
ax.set_ylabel('True', fontsize=14)
plt.show()

In [None]:
(1682+1673) / 5000

In [None]:
#first run
con_mat_df = pd.DataFrame(np.array([[2266, 277],
                                    [272, 2185]]),
                               columns=["negative", "positive"],
                               index=["negative", "positive"])
f,ax = plt.subplots(figsize=(5, 3))
sns.heatmap(con_mat_df, annot=True, vmax=2200, vmin=200, cmap=plt.cm.Blues,fmt='g')
plt.tight_layout()
ax.set_title('BERT-base model', fontsize=16)
ax.set_xlabel('Predicted', fontsize=14)
ax.set_ylabel('True', fontsize=14)
plt.show()