In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('/content/Laptop_Train_v2.csv')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
df

Unnamed: 0,id,Sentence,Aspect Term,polarity,from,to
0,2339,I charge it at night and skip taking the cord ...,cord,neutral,41,45
1,2339,I charge it at night and skip taking the cord ...,battery life,positive,74,86
2,1316,The tech guy then said the service center does...,service center,negative,27,41
3,1316,The tech guy then said the service center does...,"""sales"" team",negative,109,121
4,1316,The tech guy then said the service center does...,tech guy,neutral,4,12
...,...,...,...,...,...,...
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,neutral,104,134
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,neutral,140,170
2355,848,"How Toshiba handles the repair seems to vary, ...",repair,conflict,24,30
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,positive,130,136


In [3]:
map_polarity = {'neutral': 2, 'positive': 1, "negative": 0} # map data

df["polarity"] = df["polarity"].map(map_polarity)

In [4]:
df=df.drop('from',axis=1)

In [5]:
df=df.drop('to',axis=1)

In [6]:
df

Unnamed: 0,id,Sentence,Aspect Term,polarity
0,2339,I charge it at night and skip taking the cord ...,cord,2.0
1,2339,I charge it at night and skip taking the cord ...,battery life,1.0
2,1316,The tech guy then said the service center does...,service center,0.0
3,1316,The tech guy then said the service center does...,"""sales"" team",0.0
4,1316,The tech guy then said the service center does...,tech guy,2.0
...,...,...,...,...
2353,2272,We also use Paralles so we can run virtual mac...,Windows Server Enterprise 2003,2.0
2354,2272,We also use Paralles so we can run virtual mac...,Windows Server 2008 Enterprise,2.0
2355,848,"How Toshiba handles the repair seems to vary, ...",repair,
2356,848,"How Toshiba handles the repair seems to vary, ...",repair,1.0


In [7]:
df.shape

(2358, 4)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2358 entries, 0 to 2357
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           2358 non-null   int64  
 1   Sentence     2358 non-null   object 
 2   Aspect Term  2358 non-null   object 
 3   polarity     2313 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 73.8+ KB


In [9]:
# Preprocess the data
def preprocess_text(Sentence):
    # Remove special characters and digits
    Sentence = re.sub('[^a-zA-Z]', ' ', Sentence)
    # Convert to lowercase
    Sentence = Sentence.lower()
    # Tokenize the text
    tokens = word_tokenize(Sentence)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Join the tokens back into a string
    Sentence = ' '.join(tokens)
    return Sentence



In [10]:
# Apply the preprocessing function to the text column
df['Sentence'] = df['Sentence'].apply(preprocess_text)



In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Sentence'], df['Aspect Term'] ,test_size=0.2, random_state=42)



In [12]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of y_test: ", y_test.shape)


Shape of X_train:  (1886,)
Shape of X_test:  (472,)
Shape of y_train:  (1886,)
Shape of y_test:  (472,)


In [13]:
X_train.head()

230     course also several great software packages ca...
485                                   apple care included
1990    apparently well built gorgeous look macbook pr...
1499                  mouse pad left button always sticks
940          love stability mac software operating system
Name: Sentence, dtype: object

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('KNN', KNeighborsClassifier())         
])

In [15]:
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

                           1-year-warranty       0.00      0.00      0.00         1
                                    10-key       0.00      0.00      0.00         0
                           12 cell battery       0.00      0.00      0.00         1
                                   15 inch       0.00      0.00      0.00         0
                                       15"       0.00      0.00      0.00         0
                            17 ince screen       0.00      0.00      0.00         0
                            17 inch screen       0.00      0.00      0.00         1
                                   18-inch       0.00      0.00      0.00         1
                              18.4" screen       0.00      0.00      0.00         0
                               22" Monitor       0.00      0.00      0.00         1
                       2GB stick of memory       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.naive_bayes import MultinomialNB


#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

                                precision    recall  f1-score   support

               1-year-warranty       0.00      0.00      0.00         1
               12 cell battery       0.00      0.00      0.00         1
                17 inch screen       0.00      0.00      0.00         1
                       18-inch       0.00      0.00      0.00         1
                   22" Monitor       0.00      0.00      0.00         1
           2GB stick of memory       0.00      0.00      0.00         1
                    3G network       0.00      0.00      0.00         1
              500gb hard drive       0.00      0.00      0.00         1
                       8GB RAM       0.00      0.00      0.00         1
                    8GB of RAM       0.00      0.00      0.00         1
                 AC power port       0.00      0.00      0.00         1
               Apple Care plan       0.00      0.00      0.00         1
            Apple applications       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
from sklearn.ensemble import RandomForestClassifier

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer(ngram_range = (3,3))),        #using the ngram_range parameter 
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

                                precision    recall  f1-score   support

               1-year-warranty       0.00      0.00      0.00         1
               12 cell battery       0.00      0.00      0.00         1
                           15"       0.00      0.00      0.00         0
                17 inch screen       0.00      0.00      0.00         1
                       18-inch       0.00      0.00      0.00         1
                   22" Monitor       0.00      0.00      0.00         1
           2GB stick of memory       0.00      0.00      0.00         1
                    3G network       0.00      0.00      0.00         1
                    4GB of RAM       0.00      0.00      0.00         0
              500gb hard drive       0.00      0.00      0.00         1
                       8GB RAM       0.00      0.00      0.00         1
                    8GB of RAM       0.00      0.00      0.00         1
                 AC power port       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**BOW**

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<1886x2693 sparse matrix of type '<class 'numpy.int64'>'
	with 17794 stored elements in Compressed Sparse Row format>

In [19]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
X_train_cv.shape

(1886, 2693)

In [21]:
v.vocabulary_

{'course': 518,
 'also': 72,
 'several': 2105,
 'great': 1018,
 'software': 2188,
 'packages': 1662,
 'came': 315,
 'free': 935,
 'including': 1170,
 'iwork': 1248,
 'garageband': 968,
 'imovie': 1149,
 'apple': 114,
 'care': 328,
 'included': 1169,
 'apparently': 110,
 'well': 2625,
 'built': 286,
 'gorgeous': 1004,
 'look': 1378,
 'macbook': 1405,
 'pro': 1805,
 'winning': 2646,
 'combination': 415,
 'price': 1794,
 'performance': 1704,
 'mouse': 1513,
 'pad': 1664,
 'left': 1323,
 'button': 301,
 'always': 76,
 'sticks': 2279,
 'love': 1392,
 'stability': 2247,
 'mac': 1404,
 'operating': 1621,
 'system': 2337,
 'either': 732,
 'computer': 456,
 'slow': 2172,
 'detect': 619,
 'keys': 1273,
 'typed': 2484,
 'unlikely': 2506,
 'typist': 2488,
 'keyboard': 1270,
 'simply': 2145,
 'everything': 782,
 'fine': 882,
 'machine': 1408,
 'speed': 2227,
 'capacity': 324,
 'build': 284,
 'first': 891,
 'full': 950,
 'charge': 357,
 'battery': 190,
 'got': 1005,
 'hours': 1111,
 'fan': 847,
 've

In [22]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
np.where(X_train_np[0]!=0)

(array([  72,  315,  518,  935,  968, 1018, 1149, 1170, 1248, 1662, 2105,
        2188]),)

Train the naive bayes model

In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [25]:
X_test_cv = v.transform(X_test)

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

                                precision    recall  f1-score   support

               1-year-warranty       0.00      0.00      0.00         1
               12 cell battery       0.00      0.00      0.00         1
                17 inch screen       0.00      0.00      0.00         1
                       18-inch       0.00      0.00      0.00         1
                   22" Monitor       0.00      0.00      0.00         1
           2GB stick of memory       0.00      0.00      0.00         1
                    3G network       0.00      0.00      0.00         1
                    4GB of RAM       0.00      0.00      0.00         0
              500gb hard drive       0.00      0.00      0.00         1
                       8GB RAM       0.00      0.00      0.00         1
                    8GB of RAM       0.00      0.00      0.00         1
                 AC power port       0.00      0.00      0.00         1
               Apple Care plan       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train the model using sklearn pipeline and reduce number of lines of code

In [27]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [28]:
clf.fit(X_train, y_train)

In [29]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

                                precision    recall  f1-score   support

               1-year-warranty       0.00      0.00      0.00         1
               12 cell battery       0.00      0.00      0.00         1
                17 inch screen       0.00      0.00      0.00         1
                       18-inch       0.00      0.00      0.00         1
                   22" Monitor       0.00      0.00      0.00         1
           2GB stick of memory       0.00      0.00      0.00         1
                    3G network       0.00      0.00      0.00         1
                    4GB of RAM       0.00      0.00      0.00         0
              500gb hard drive       0.00      0.00      0.00         1
                       8GB RAM       0.00      0.00      0.00         1
                    8GB of RAM       0.00      0.00      0.00         1
                 AC power port       0.00      0.00      0.00         1
               Apple Care plan       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
