In [None]:
# A linear classification algorithm. Linear decision boundary.

# Generative vs Discriminative Classifier. LR - Discriminative (p(y|x) directly decision boundary)
# NB - Generative (Trying to estimate the distribution of features. How data is distributed)

# Assumes the following functional form for p(y|x) :  p(y=1|x) = (e**(w0+ sumi (wiXi)))/ (1 + e**(w0+ sumi (wiXi)))    ----   1 / (1 + e**(-z))  Logit function or Sigmoid
# Features can be discrete or continuous.

# w0 : bias --> shows a bias to a class. Change of coefficients changes the slope of linear boundary. so number of + or - changes.

# Link function : squeeze real line into [0,1] . sigmoid function is the function converting the score to a probability.

# If score is 0 then prob is 0.5.  sigmoid(score) vs score

# You are deciding to which class to assing the predictions by comparing the probabilities and if you look at the p(y=1|x)/p(y=0|x) then you will see the decision boundary is linear.

# Training the model means finding the optimal w's. p(y|xi,w) find such a w that asssigns 0 prob to negative data points and 1 to positive data points.

#  Likelihood l(w) : measures quality of fit for model with coefficiets w.

# iid data points, max multip. p(yi|xi,w)   Maximizing likelihood.

# There is no closed form solution therefore you need to use gradient ascent algorithm.
# C Concave function of w.  gradient ascent(concave),gradient descent(convex)

# Likelihood function is concave, - likelihood is convex function.

# Scaling is important because of the gradient descent algortihm.

# Easily extended to multiple class, natural prob view of class predictions, quick to train, very fast at classifying unknown records, good accuracy for many simple data sets like  text, can interpret model coefficients as indicators of feature importance(scaling is crucial)

# By using regularization you can have a chance to drop features because very big coefficients can be problematic since even though it changes slightly it can have a high effect on score.

# Disadvantage : Linear decision boundary (too simple for more complex problem)


In [None]:
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/datasets/pradeeshprabhakar/preprocessed-dataset-sentiment-analysis") # download data from kaggle datasets

In [24]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as met
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
data = pd.read_csv(r"..\EcoPreprocessed.csv")

In [5]:
data = data[["review","division"]]

In [6]:
data.loc[data.division == "neutral","division"] = "negative"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data.division == "neutral","division"] = "negative"


In [7]:
data

Unnamed: 0,review,division
0,able play youtube alexa,positive
1,able recognize indian accent really well drop ...,positive
2,absolute smart device amazon connect external ...,positive
3,absolutely amaze new member family control hom...,positive
4,absolutely amaze previously sceptical invest m...,positive
...,...,...
4079,yo yo yo love go if want one smart speaker val...,positive
4080,youtube music,negative
4081,youtube support nahi kartasong recognise achha...,negative
4082,yup proscontrols wipro light amazinglysony bra...,negative


In [8]:
data.division.value_counts()


positive    3066
negative    1018
Name: division, dtype: int64

In [9]:
X = data["review"].values 
y = data["division"].values

In [10]:
X

array(['able play youtube alexa',
       'able recognize indian accent really well drop function helpful call device talk person near device smart plug schedule work seamlessly con would sound kindloud but lack clarity mid frequency need tweeked optimum clarity rarely device doesnt respond call alexa',
       'absolute smart device amazon connect external sub woofer sound amaze recons voice even close room like almost collection songs english hindi must quite moneys worth',
       ..., 'youtube support nahi kartasong recognise achha nahi',
       'yup proscontrols wipro light amazinglysony braviaconsjust take command wait minute confirmation',
       'zero integration capabilities fire tv devices can not use aux useless thing'],
      dtype=object)

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 7,stratify = y)

In [12]:
X_train = X_train.astype("U")
X_test = X_test.astype("U")

In [13]:
count_vectorizer = CountVectorizer()

count_vectorizer.fit(X_train) 

In [14]:
X_train_count_vector = count_vectorizer.transform(X_train)

In [15]:
X_test_count_vector = count_vectorizer.transform(X_test)

In [17]:
model = LogisticRegression()

model.fit(X_train_count_vector,y_train)

pred = model.predict(X_test_count_vector)

In [20]:
met.accuracy_score(y_test,pred)

0.9082007343941249

In [21]:
pred_prob = model.predict_proba(X_test_count_vector)

In [22]:
pred_prob

array([[7.48040614e-03, 9.92519594e-01],
       [6.43038059e-04, 9.99356962e-01],
       [4.44141599e-04, 9.99555858e-01],
       ...,
       [1.87038610e-01, 8.12961390e-01],
       [1.10770298e-01, 8.89229702e-01],
       [5.94233847e-01, 4.05766153e-01]])

In [23]:
# CountVector does not care about stopwords. Therefore they may be better solutions.

# TF-IDF : If a word is in just 1 sentence then it may be important but if every sentence contains that word it may not be important.
# Therefore instead of counting you can give TF_IDF score to a word.
# Suppose "the" word is used for multiple times in a sentence but also every sentence contains "the". Therefore its point is 0.
# You see "wondering" word for just 1 time in 1 sentence therefore its point is 0.3

# Term Frequency :How many times we see a word in the sentence.
# Inverse Document Frequency : If the word is in every sentence, then it is not important

In [25]:
tf_idf_vectorizer = TfidfVectorizer()

In [26]:
tf_idf_vectorizer.fit(X_train)

In [27]:
X_train_tf_idf = tf_idf_vectorizer.transform(X_train)

In [28]:
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)

In [29]:
feature_array = np.array(tf_idf_vectorizer.get_feature_names())



In [30]:
tf_idf_scoring = np.argsort(X_train_tf_idf.toarray()).flatten()[::-1]

In [32]:
feature_array[tf_idf_scoring][:10] # 10 words with high tf-idf scores

array(['carity', 'childs', 'simple', 'awesome', 'use', 'like', 'sound',
       'firesticks', 'finish', 'finishingthat'], dtype='<U30')

In [33]:
feature_array[tf_idf_scoring][-10:] # 10 words with low tf-idf scores 

array(['prblm', 'prayers', 'prayer', 'prasagalu', 'practical', 'ppsamit',
       'ppl', 'powersource', 'powerpack', 'aa'], dtype='<U30')

In [34]:
model = LogisticRegression()

In [36]:
model.fit(X_train_tf_idf,y_train)

In [37]:
# In default Logistic Regression uses C coefficient and uses l2 penalty. 1/C = lambda

In [38]:
pred = model.predict(X_test_tf_idf)
accuracy_score(y_test,pred)


0.8653610771113831

In [None]:
# C (Regularization coefficient) - If you face with overfitting then you need to regularize your model. l2,l1
# Decision boundary is always linear. You can change slopes by using C.

In [39]:
# Regularization is so much therefore we will face with underfitting.
model = LogisticRegression(C = 0.000001)
model.fit(X_train_tf_idf,y_train)

In [40]:
pred = model.predict(X_train_tf_idf)
accuracy_score(y_train,pred)

0.7508417508417509

In [41]:
pred = model.predict(X_test_tf_idf)
accuracy_score(y_test,pred)  # testte daha da kötü bir performans gösterdi.


0.7503059975520195

In [42]:
# Since regularization is so low, we will face with overfitting

model = LogisticRegression(C = 100000)
model.fit(X_train_tf_idf,y_train)

pred = model.predict(X_train_tf_idf)
accuracy_score(y_train,pred)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9978573614937252

In [43]:
pred = model.predict(X_test_tf_idf)
accuracy_score(y_test,pred)

0.8910648714810282

In [None]:
#max_iter parameter is used to limit the number of iteration to solve with gradient descent.

# Gradient descent : w(t+1) = w(t) - n*derivative(f(x;w),y)   ---take derivative wrt w and multiply with x . 
# Start w's as  0 0 0 0... then take derivative and update by multiply with learning rate. 

# gradient shows how off you are from the actual results.