In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
l=WordNetLemmatizer()
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

[nltk_data] Downloading package stopwords to C:\Users\Nischay
[nltk_data]     kapoor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Loading the Dataset
df=pd.read_csv("Education.csv")

In [3]:
df.head()

Unnamed: 0,Text,Label
0,The impact of educational reforms remains unce...,positive
1,Critics argue that recent improvements in the ...,negative
2,Innovative teaching methods have led to unexpe...,positive
3,"Despite budget constraints, the school has man...",positive
4,The true effectiveness of online learning plat...,negative


Exploratory Data Analysis (EDA)

In [4]:
df.isnull().sum()

Text     0
Label    0
dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
df.shape

(52, 2)

In [7]:
df.describe()

Unnamed: 0,Text,Label
count,52,52
unique,52,2
top,The impact of educational reforms remains unce...,positive
freq,1,26


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    52 non-null     object
 1   Label   52 non-null     object
dtypes: object(2)
memory usage: 964.0+ bytes


In [9]:
df['Label'].value_counts()   

Label
positive    26
negative    26
Name: count, dtype: int64

 Data Preprocessing

In [10]:
import re
corpus=[]
for i in range(0,len(df)):
    words=re.sub('[^a-zA-Z]',' ',df['Text'][i])
    words=words.lower()
    words=words.split()
    words=[l.lemmatize(word) for word in words if word not in stop_words]
    words=' '.join(words)
    corpus.append(words)

In [11]:
X=corpus

In [12]:
encoder=LabelEncoder()
encoder.fit_transform(df['Label'])

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0])

In [13]:
y=df['Label']

In [14]:
#Splitting the Data
x_train, x_test, y_train, y_test = train_test_split(corpus,y, test_size=0.3, random_state=42)

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
#using Tfidf and Random Forest
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', RandomForestClassifier())])
classifier.fit(x_train, y_train)

In [17]:
#Model Evaluation
y_pred_tf = classifier.predict(x_test)
accuracy_score(y_test, y_pred_tf)

0.3125

In [18]:
confusion_matrix(y_test, y_pred_tf)

array([[4, 2],
       [9, 1]], dtype=int64)

In [19]:
#using bow and Random Forest
from sklearn.feature_extraction.text import CountVectorizer
classifier = Pipeline([('bow', CountVectorizer()), ('classifier', RandomForestClassifier())])
classifier.fit(x_train, y_train)

In [20]:
y_pred_bow = classifier.predict(x_test)
accuracy_score(y_test, y_pred_bow)

0.375

In [21]:
confusion_matrix(y_test, y_pred_bow)

array([[5, 1],
       [9, 1]], dtype=int64)

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
#using Tfidf and Naive Baiys
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', MultinomialNB())])
classifier.fit(x_train, y_train)

In [24]:
y_pred_nb = classifier.predict(x_test)
accuracy_score(y_test, y_pred_nb)

0.3125

In [25]:
confusion_matrix(y_test, y_pred_nb)

array([[3, 3],
       [8, 2]], dtype=int64)

In [26]:
from sklearn.svm import SVC

In [27]:
#using Tfidf and Support Vector Classifier
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', SVC())])
classifier.fit(x_train, y_train)

In [28]:
y_pred_svc = classifier.predict(x_test)
accuracy_score(y_test, y_pred_svc)

0.375

In [29]:
confusion_matrix(y_test, y_pred_svc)

array([[5, 1],
       [9, 1]], dtype=int64)

In [30]:
#using BOW and SVC
classifier = Pipeline([('bow', CountVectorizer()), ('classifier', SVC())])
classifier.fit(x_train, y_train)

In [31]:
y_pred_sv = classifier.predict(x_test)
accuracy_score(y_test, y_pred_sv)

0.3125

In [32]:
confusion_matrix(y_test, y_pred_sv)

array([[4, 2],
       [9, 1]], dtype=int64)

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
#using Tfidf and Logistic Regression
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', LogisticRegression())])
classifier.fit(x_train, y_train)

In [35]:
y_pred_lr = classifier.predict(x_test)
accuracy_score(y_test, y_pred_lr)

0.375

In [36]:
confusion_matrix(y_test, y_pred_lr)

array([[5, 1],
       [9, 1]], dtype=int64)

In [37]:
classifier.predict(['The impact of educational reforms remains uncertain despite extensive research.'])

array(['positive'], dtype=object)