In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('/content/Laptop_Train_v2.csv')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170
2355,848,"How Toshiba handles the repair seems to vary, ...",repair,conflict,24,30
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,positive,130,136


In [3]:
map_polarity = {'neutral': 2, 'positive': 1, "negative": 0} # map data

df["polarity"] = df["polarity"].map(map_polarity)

In [4]:
df=df.drop('from',axis=1)

In [5]:
df=df.drop('to',axis=1)

In [6]:
df.isnull().sum()

id              0
Sentence        0
Aspect Term     0
polarity       45
dtype: int64

In [7]:
df=df.dropna()

In [8]:
df

Unnamed: 0,id,Sentence,Aspect Term,polarity
0,2339,I charge it at night and skip taking the cord ...,cord,2.0
1,2339,I charge it at night and skip taking the cord ...,battery life,1.0
2,1316,The tech guy then said the service center does...,service center,0.0
3,1316,The tech guy then said the service center does...,"""sales"" team",0.0
4,1316,The tech guy then said the service center does...,tech guy,2.0
...,...,...,...,...
2352,2272,We also use Paralles so we can run virtual mac...,Windows 7 Home Premium,2.0
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,2.0
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,2.0
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,1.0


In [9]:
df.shape

(2313, 4)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2313 entries, 0 to 2357
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           2313 non-null   int64  
 1   Sentence     2313 non-null   object 
 2   Aspect Term  2313 non-null   object 
 3   polarity     2313 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 90.4+ KB


In [11]:
# Preprocess the data
def preprocess_text(Sentence):
    # Remove special characters and digits
    Sentence = re.sub('[^a-zA-Z]', ' ', Sentence)
    # Convert to lowercase
    Sentence = Sentence.lower()
    # Tokenize the text
    tokens = word_tokenize(Sentence)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Join the tokens back into a string
    Sentence = ' '.join(tokens)
    return Sentence



In [12]:
# Apply the preprocessing function to the text column
Sentence = df['Sentence'].apply(preprocess_text)



In [13]:
Sentence

0         charge night skip taking cord good battery life
1         charge night skip taking cord good battery life
2       tech guy said service center exchange direct c...
3       tech guy said service center exchange direct c...
4       tech guy said service center exchange direct c...
                              ...                        
2352    also use paralles run virtual machines windows...
2353    also use paralles run virtual machines windows...
2354    also use paralles run virtual machines windows...
2356    toshiba handles repair seems vary folks indica...
2357    would like use different operating system alto...
Name: Sentence, Length: 2313, dtype: object

In [14]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Sentence'], df['polarity'] ,test_size=0.2, random_state=42)



In [15]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of y_test: ", y_test.shape)


Shape of X_train:  (1850,)
Shape of X_test:  (463,)
Shape of y_train:  (1850,)
Shape of y_test:  (463,)


In [16]:
X_train.head()

886     You have to toss out the wifi card and replace...
1075    I have Vista, so I am unable to install and un...
367     Laptop was in new condition and operational, b...
1605    Battery is lasting about 6 hours as I am surfi...
358     Windows XP SP2 caused many problems on the com...
Name: Sentence, dtype: object

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('KNN', KNeighborsClassifier())         
])

In [18]:
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.61      0.69      0.65       181
         1.0       0.62      0.81      0.70       168
         2.0       0.61      0.22      0.32       114

    accuracy                           0.62       463
   macro avg       0.61      0.57      0.56       463
weighted avg       0.61      0.62      0.59       463



In [19]:
from sklearn.naive_bayes import MultinomialNB


#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.70      0.75      0.73       181
         1.0       0.61      0.92      0.74       168
         2.0       1.00      0.17      0.29       114

    accuracy                           0.67       463
   macro avg       0.77      0.61      0.58       463
weighted avg       0.74      0.67      0.62       463



In [20]:
from sklearn.ensemble import RandomForestClassifier

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer(ngram_range = (3,3))),        #using the ngram_range parameter 
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.41      0.54       181
         1.0       0.50      0.92      0.65       168
         2.0       0.75      0.40      0.53       114

    accuracy                           0.59       463
   macro avg       0.68      0.58      0.57       463
weighted avg       0.68      0.59      0.58       463



**BOW**

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<1850x2866 sparse matrix of type '<class 'numpy.int64'>'
	with 30101 stored elements in Compressed Sparse Row format>

In [22]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
X_train_cv.shape

(1850, 2866)

In [24]:
v.vocabulary_

{'you': 2859,
 'have': 1141,
 'to': 2568,
 'toss': 2583,
 'out': 1764,
 'the': 2529,
 'wifi': 2805,
 'card': 416,
 'and': 160,
 'replace': 2090,
 'it': 1340,
 'just': 1360,
 'any': 170,
 'sort': 2345,
 'of': 1711,
 'network': 1655,
 'capability': 412,
 'vista': 2730,
 'so': 2321,
 'am': 147,
 'unable': 2640,
 'install': 1294,
 'uninstall': 2651,
 'some': 2332,
 'programs': 1949,
 'laptop': 1398,
 'was': 2758,
 'in': 1258,
 'new': 1659,
 'condition': 547,
 'operational': 1742,
 'but': 385,
 'for': 1001,
 'audio': 241,
 'problem': 1933,
 'when': 2794,
 '1st': 17,
 'sent': 2227,
 'repair': 2086,
 'battery': 272,
 'is': 1330,
 'lasting': 1405,
 'about': 71,
 'hours': 1204,
 'as': 213,
 'surfing': 2466,
 'web': 2773,
 'on': 1725,
 'sundays': 2454,
 'while': 2799,
 'checking': 458,
 'football': 1000,
 'scores': 2186,
 'watching': 2763,
 'funny': 1044,
 'youtube': 2862,
 'videos': 2720,
 'windows': 2812,
 'xp': 2851,
 'sp2': 2349,
 'caused': 432,
 'many': 1540,
 'problems': 1934,
 'computer':

In [25]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
np.where(X_train_np[0]!=0)

(array([ 160,  170,  412,  416, 1141, 1340, 1360, 1655, 1711, 1764, 2090,
        2345, 2529, 2568, 2583, 2805, 2859]),)

Train the naive bayes model

In [27]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [28]:
X_test_cv = v.transform(X_test)

In [29]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.74      0.75      0.75       181
         1.0       0.68      0.87      0.76       168
         2.0       0.69      0.39      0.50       114

    accuracy                           0.71       463
   macro avg       0.70      0.67      0.67       463
weighted avg       0.71      0.71      0.69       463



Train the model using sklearn pipeline and reduce number of lines of code

In [30]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [31]:
clf.fit(X_train, y_train)

In [32]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.74      0.75      0.75       181
         1.0       0.68      0.87      0.76       168
         2.0       0.69      0.39      0.50       114

    accuracy                           0.71       463
   macro avg       0.70      0.67      0.67       463
weighted avg       0.71      0.71      0.69       463



## GgloVe

In [33]:
num_word=len(Sentence)

In [34]:
num_word

2313

In [35]:
Sentence[0]

'charge night skip taking cord good battery life'

In [36]:
train_size=int(train.shape[0]*0.8)

NameError: ignored