# Importing Modules & Data

In [9]:
# Import Pandas and NumPy
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
data_path = '/content/drive/MyDrive/Colab Notebooks/CS512/Tutorials_colab/balanced_reviews_small.csv'

# Import the data

reviews_df = pd.read_csv(data_path, index_col=0)

# Print head of the data
print(reviews_df.head())

                                            sentence  label
0  2.5 yıldır kızım için prima kullanıyorum ve ga...      1
1  ürün başlıkların girişleri hariç çok iyi. kull...      1
2  bugune dek defalarca küçük bedenlerini aldığım...      0
3        her yaptığı kaka olduğu gibi dışına cikiyor      0
4  hepsi burada çok hızlı sipariş verdim çabuk ge...      1


In [11]:
# Print the shape of the data
reviews_df.shape

(2000, 2)

In [49]:
print(reviews_df['label'].value_counts().to_frame().T)

          1     0
label  1000  1000


In [13]:
# Print the class distribution
print(reviews_df['label'].value_counts().to_frame().T)

reviews_df[reviews_df['label'] == 1].shape
reviews_df[reviews_df['label'] == 0].shape

(1000, 2)

In [15]:
# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split
# Extract X and y
X = reviews_df['sentence'].values
y = reviews_df['label'].values

# Use train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [16]:
# Shape of Train Data
print("Shape of train data:", X_train.shape)
# Class distribution in Train Data
print("# of positive samples in training set", X_train[y_train == 1].shape)
print("# of positive samples in training set", X_train[y_train == 0].shape)

Shape of train data: (1600,)
# of positive samples in training set (800,)
# of positive samples in training set (800,)


In [17]:
# Shape of Test Data
print("Shape of train data:", X_test.shape)
# Class distribution in Test Data
print("# of positive samples in training set", X_test[y_test == 1].shape)
print("# of positive samples in training set", X_test[y_test == 0].shape)

Shape of train data: (400,)
# of positive samples in training set (200,)
# of positive samples in training set (200,)


In [20]:
# Change Train and Test encoding as unicode
X_train = X_train.astype('U')
X_test = X_test.astype('U')

# Model 1

## Feature Generation With Count Vectors

![Count Vector](https://drive.google.com/uc?export=view&id=1a27TPRbB94wTvrGVrce58Tvjg_XefYWr)
https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/assets/atap_0402.png

In [23]:
# Import CountVectorizer from sklearn.feature_extraction.text
from sklearn.feature_extraction.text import CountVectorizer
# Create CountVectorizer object
cv = CountVectorizer()

In [24]:
# Fit on Training Data
cv.fit(X_train)
# Get Count Vectors for Train Data
X_train_count_vector = cv.transform(X_train)

In [25]:
# Get Count Vectors for Test Data
X_test_count_vector = cv.transform(X_test)

In [37]:
X_test_count_vector

(400, 6992)

In [39]:
X_train.shape

(1600,)

## Model Evaluation

In [27]:
# Import LogisticRegression from sklearn.linear_model
from sklearn.linear_model import LogisticRegression
# Create LogisticRegression object
model = LogisticRegression()

In [28]:
# Fit Train Data
model.fit(X_train_count_vector, y_train)

LogisticRegression()

In [29]:
# Get Predictions for Test Data
y_pred = model.predict(X_test_count_vector)

In [30]:
# Check first 5 predictions
y_pred[0:5]

array([1, 1, 1, 0, 0])

In [31]:
# Import accuracy_score Score from sklearn.metrics
from sklearn.metrics import accuracy_score
# Print Accuracy
acc = accuracy_score(y_pred=y_pred, y_true=y_test)
print("Accuracy of model using count_vectors approach:", acc)

Accuracy of model using count_vectors approach: 0.88


In [34]:
# Check prediction probabilities for X_test 
y_pred_probs = model.predict_proba(X_test_count_vector)
y_pred_probs[:10]
print(y_pred_probs[:10].sum(axis=1))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [45]:
X_train_count_vector

<1600x6992 sparse matrix of type '<class 'numpy.int64'>'
	with 24555 stored elements in Compressed Sparse Row format>

In [48]:

model.coef_[]

IndexError: ignored

# Model 2

## Feature Generation With TF - IDF Vectors

![TF - IDF Vector](https://drive.google.com/uc?export=view&id=1bzVkWEj77tDKBK4sB7UNq90NGbi0Wes8)
https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/assets/atap_0404.png

In [None]:
# Import TfidfVectorizer from sklearn.feature_extraction.text

# Create TfidfVectorizer object


In [None]:
# Fit on Train Data

# Get TF-IDF scores for Train Data


In [None]:
# Get TF-IDF scores for Test Data


In [None]:
# https://stackoverflow.com/questions/34232190/scikit-learn-tfidfvectorizer-how-to-get-top-n-terms-with-highest-tf-idf-score

# Get Feature Names from vectorizer 

# Sort args


In [None]:
# High TF-IDF Scores


In [None]:
# Low TF-IDF Scores


## Model Evaluation

In [None]:
# Create LogisticRegression object


In [None]:
# Fit Train Data


In [None]:
# Get Predictions for Test Data


In [None]:
# Check first 5 predictions


In [None]:
# Print Accuracy


# Hyperparameters of Logistic Regression

## C (Regularization )

![TF - IDF Vector](https://drive.google.com/uc?export=view&id=1DpRXzvRSsgZyzuLIpW8cSSh9YTxXE50A)

https://towardsdatascience.com/underfitting-and-overfitting-in-machine-learning-and-how-to-deal-with-it-6fe4a8a49dbf

In [None]:

# Create LogisticRegression model with C = 0.0000001
model = LogisticRegression(C=0.0000001)
# Fit Train Data
model.fit(X_train_tfidf,y_train)

In [51]:
# Get Predictions for Train Data

# Print Accuracy on Train data


In [None]:
# Get Predictions for Test Data

# Print Accuracy on Test data


In [None]:
# Create LogisticRegression model with C = 10000000

# Fit Train Data


In [None]:
# Get Predictions for Train Data

# Print Accuracy on Train data


In [None]:
# Get Predictions for Test Data

# Print Accuracy on Test data


## max_iter

![TF - IDF Vector](https://drive.google.com/uc?export=view&id=1wQwQoHd1eV0gkYiksfHwXo5kPAoRNNf_)

https://towardsdatascience.com/optimization-loss-function-under-the-hood-part-ii-d20a239cde11

In [None]:
# Create LogisticRegression model with max_iter = 1

# Fit Train Data


In [None]:
# Get Predictions for Train Data

# Print Accuracy on Train data


In [None]:
# Get Predictions for Test Data

# Print Accuracy on Test data


In [None]:
# Create LogisticRegression model with max_iter = 100000

# Fit Train Data


In [None]:
# Get Predictions for Train Data

# Print Accuracy on Train data


In [None]:
# Get Predictions for Test Data

# Print Accuracy on Test data
