<a href="https://colab.research.google.com/github/kratikagupta6394/sentiment-analysis-with-scikit-learn/blob/main/sentiment_analysis_with_scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the review dataset from the working directory; later cells read its
# 'review' and 'sentiment' columns.
df = pd.read_csv('movie_data.csv')


In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Three toy documents; the third repeats words so counts above 1 appear.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, The weather is sweet, and one and one is two',
])

# Learn the vocabulary and build the document-term count matrix in one step.
count = CountVectorizer()
bag = count.fit_transform(docs)

# vocabulary_ is a dict mapping each unique token to its column index.
print(count.vocabulary_)
# Convert the count matrix to a dense array so every row prints as a
# plain feature vector.
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Print floats with two decimals so the tf-idf matrix stays readable.
np.set_printoptions(precision=2)

# Re-weight the raw counts in `bag` with smoothed idf and L2-normalise each row.
tfidf = TfidfTransformer(
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)

print(
    tfidf.fit_transform(bag).toarray()
)

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [None]:
# Data preparation: look at the last 50 characters of the review at index
# label 0 to see the raw markup (e.g. HTML line breaks) that needs cleaning.
df.loc[0,'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [None]:
import re
def preprocessor(text):
  """Clean one raw review: drop HTML tags, lowercase, keep emoticons.

  Steps:
    1. Remove HTML tags such as ``<br />``.
    2. Collect emoticons (``:)``, ``;-(``, ``=D``, ``:P`` ...) before the
       punctuation they are made of is stripped.
    3. Lowercase the text and collapse every run of non-word characters
       into a single space.
    4. Append the emoticons (with the nose ``-`` removed) after the
       cleaned text.

  Args:
    text: Raw review string, possibly containing HTML markup.

  Returns:
    The cleaned, lowercased text, followed by any emoticons that were
    found. When no emoticon is present only the cleaned text is returned.
  """
  # A tag runs from '<' to the nearest '>'. The earlier pattern '<[^<.]*>'
  # skipped tags containing '.' (e.g. href="x.com") and could run past the
  # tag's own '>' up to a later one, deleting real text.
  text = re.sub(r'<[^>]*>', '', text)
  # eyes (: ; =), optional nose (-), mouth ( ")" "(" "D" "P" ). The mouth
  # alternatives must be separate; '\(D' only matched the literal "(D", so
  # ':(' and ':D' were never captured.
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  cleaned = re.sub(r'\W+', ' ', text.lower())
  if emoticons:
    # Force exactly one space before the emoticon block so the first
    # emoticon is never fused onto the last word of the text.
    cleaned = cleaned.rstrip() + ' ' + ' '.join(emoticons).replace('-', '')
  return cleaned

In [None]:
# Sanity check: run the cleaner on the same 50-character tail inspected earlier.
preprocessor(df.loc[0,'review'][-50:])

'is seven title brazil not available'

In [None]:
# Clean every review. This overwrites the raw text in place, so re-running
# this cell applies the cleaner to already-cleaned text; reload df to start over.
df['review'] = df['review'].apply(preprocessor)

Task 5: Tokenize and Stem the Reviews

In [None]:
from nltk.stem.porter import PorterStemmer

# Shared stemmer instance; tokenizer_stemming calls porter.stem on each word.
porter = PorterStemmer()

In [None]:
def tokenizer(text):
  """Split ``text`` on runs of whitespace and return the list of tokens."""
  tokens = text.split()
  return tokens

In [None]:
def tokenizer_stemming(text):
  """Split ``text`` on whitespace and return the stem of every token.

  Stemming is delegated to the module-level ``porter`` stemmer.
  """
  words = text.split()
  return list(map(porter.stem, words))

In [None]:
import nltk
# Download NLTK's 'stopwords' package, which the stopwords.words('english')
# call in the next cell reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
# English stop-word list used to filter tokens.
stop = stopwords.words('english')

# Demo: stem a sample sentence, keep (at most) its last ten tokens and drop
# every token that is a stop word.
[
    token
    for token in tokenizer_stemming('a runner likes running and runs a lot')[-10:]
    if token not in stop
]

['runner', 'like', 'run', 'run', 'lot']

Task 6: Transform Text Data to TF-IDF Vectors


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Features: tf-idf vectors built from the cleaned reviews. The text was
# already lowercased by `preprocessor`, so the vectorizer's lowercasing and
# accent stripping are switched off; tokens come from tokenizer_stemming.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer_stemming,
    preprocessor=None,
    lowercase=False,
    strip_accents=None,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
X = tfidf.fit_transform(df['review'])

# Targets: the sentiment column as an array.
y = df['sentiment'].values

Task 7: Document Classification using Logistic Regression


In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for evaluation.
# NOTE(review): with shuffle=False the split is positional (first half train,
# second half test), so random_state appears to have no effect here -- confirm
# against the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
    random_state=1,
)


In [None]:
import pickle
from sklearn.linear_model import LogisticRegressionCV

# Fit a cross-validated logistic-regression classifier on the training split
# (5 folds, scored on accuracy, n_jobs=-1 to run in parallel).
clf = LogisticRegressionCV(cv = 5,
                           scoring = 'accuracy',
                           random_state = 0,
                           n_jobs = -1,
                           verbose = 3,
                           max_iter = 300).fit(X_train, y_train)

# Persist the fitted model. The context manager closes the file even when
# pickling raises, which the earlier open()/close() pair did not guarantee.
with open('saved_model.sav', 'wb') as saved_model:
  pickle.dump(clf, saved_model)
                          

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.5min finished


Task 8: Model Evaluation 

In [None]:
filename = 'saved_model.sav'
# Reload the pickled classifier. The context manager closes the file handle
# as soon as loading finishes instead of leaving it open.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open(filename, 'rb') as model_file:
  saved_clf = pickle.load(model_file)

In [None]:
# Score the reloaded model on the held-out test split (X_test, y_test).
saved_clf.score(X_test, y_test)

0.8962