In [138]:
import pandas as pd
import re 
import string 

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [113]:
# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/nihar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nihar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nihar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [114]:
data = pd.read_csv('./data/dataset.csv', encoding="latin-1")

In [115]:
data.head()

Unnamed: 0,ï»¿Item ID,Sentiment,SentimentText
0,1,0,"@RailMinIndia My PNR is 8348062961, I am in wa..."
1,2,0,@sureshpprabhu @RailMinIndia AC not working in...
2,3,0,@RailMinIndia I'm traveling to chennai by trai...
3,4,5,@RailMinIndia irctc is not responding at the t...
4,5,7,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...


In [116]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366 entries, 0 to 1365
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ï»¿Item ID     1366 non-null   int64 
 1   Sentiment      1366 non-null   int64 
 2   SentimentText  1366 non-null   object
dtypes: int64(2), object(1)
memory usage: 32.1+ KB


In [117]:
data.isnull().sum()

ï»¿Item ID       0
Sentiment        0
SentimentText    0
dtype: int64

In [118]:
data.rename(columns={"ï»¿Item ID": "Item ID"}, inplace=True)

In [119]:
data.head()

Unnamed: 0,Item ID,Sentiment,SentimentText
0,1,0,"@RailMinIndia My PNR is 8348062961, I am in wa..."
1,2,0,@sureshpprabhu @RailMinIndia AC not working in...
2,3,0,@RailMinIndia I'm traveling to chennai by trai...
3,4,5,@RailMinIndia irctc is not responding at the t...
4,5,7,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...


In [120]:
data['SentimentText_Copy'] = data['SentimentText']  

In [121]:
data.head()

Unnamed: 0,Item ID,Sentiment,SentimentText,SentimentText_Copy
0,1,0,"@RailMinIndia My PNR is 8348062961, I am in wa...","@RailMinIndia My PNR is 8348062961, I am in wa..."
1,2,0,@sureshpprabhu @RailMinIndia AC not working in...,@sureshpprabhu @RailMinIndia AC not working in...
2,3,0,@RailMinIndia I'm traveling to chennai by trai...,@RailMinIndia I'm traveling to chennai by trai...
3,4,5,@RailMinIndia irctc is not responding at the t...,@RailMinIndia irctc is not responding at the t...
4,5,7,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...


In [122]:
# df['SentimentText_Copy'] = df['SentimentText_Copy'].str.replace(r'@\w+', '', regex=True).str.strip()
#df['SentimentText_Copy'] = df['SentimentText_Copy'].str.replace(r'\d+', '', regex=True).str.strip()
#df['SentimentText_Copy'] = df['SentimentText_Copy'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True).str.strip() 

In [123]:
text = re.sub(r'@\w+' , '' , '@RailMinIndia My PNR is 8348062961, I am in way but there is no water in toilet and mess is everywhere in coach. please provide basic facil.')
print(text)

 My PNR is 8348062961, I am in way but there is no water in toilet and mess is everywhere in coach. please provide basic facil.


In [124]:
text = re.sub(r'\d+', '', text)
print(text)

 My PNR is , I am in way but there is no water in toilet and mess is everywhere in coach. please provide basic facil.


In [125]:
text = text.translate(str.maketrans('', '', string.punctuation))
print(text)

 My PNR is  I am in way but there is no water in toilet and mess is everywhere in coach please provide basic facil


In [126]:
text = text.lower()
print(text)

 my pnr is  i am in way but there is no water in toilet and mess is everywhere in coach please provide basic facil


In [127]:
tokens = word_tokenize(text)
print(tokens)

['my', 'pnr', 'is', 'i', 'am', 'in', 'way', 'but', 'there', 'is', 'no', 'water', 'in', 'toilet', 'and', 'mess', 'is', 'everywhere', 'in', 'coach', 'please', 'provide', 'basic', 'facil']


In [128]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [129]:
print(type(stop_words))

<class 'list'>


In [130]:
cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
print(cleaned_tokens)

['pnr', 'way', 'water', 'toilet', 'mess', 'everywhere', 'coach', 'please', 'provide', 'basic', 'facil']


In [131]:
def clean_text(text):
    # handeling the null value
    if pd.isna(text):
        return ""
    
    text = re.sub(r'@\w+' , '', text) # removing mentions (@)
    text = re.sub(r'\d+','',text) # removing numbers
    text = re.sub(r"[^\w\s]",'',text) # removing punctuation 
    
    # removing special characters and punctuation (!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)
    # cleaned_text = text.translate(str.maketrans('','',string.punctuation))
    
    # text.lower()
        
    
    #tokeniztion
    tokens  = word_tokenize(text)
    
    # lemmentization (converting words to their root form)
    cleaned_tokens= [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    cleaned_token = [word.lower() for word in cleaned_tokens ]
    
    return ' '.join(cleaned_token)
        

In [29]:
#for sentence in data['SentimentText']:
 #   data['clean'] = clean_text(sentence)

In [132]:
data['Clean_text'] = data['SentimentText'].apply(clean_text)

In [133]:
print(data['Clean_text'])

0       my pnr i way water toilet mess everywhere coac...
1       ac working prayag lucknow intercity today depa...
2       im traveling chennai train coach s my berth di...
3                    irctc responding time tatkal booking
4                      matter notified concerned official
                              ...                        
1361    an aged lady alloted upper berth train pnr kin...
1362    aggressive copassengers need immediate change ...
1363    water coachplease somethingtrain coach bjammut...
1364    while travelling train noname duronto expressu...
1365                                      httpstcofpfsdcj
Name: Clean_text, Length: 1366, dtype: object


In [134]:
data.head()

Unnamed: 0,Item ID,Sentiment,SentimentText,SentimentText_Copy,Clean_text
0,1,0,"@RailMinIndia My PNR is 8348062961, I am in wa...","@RailMinIndia My PNR is 8348062961, I am in wa...",my pnr i way water toilet mess everywhere coac...
1,2,0,@sureshpprabhu @RailMinIndia AC not working in...,@sureshpprabhu @RailMinIndia AC not working in...,ac working prayag lucknow intercity today depa...
2,3,0,@RailMinIndia I'm traveling to chennai by trai...,@RailMinIndia I'm traveling to chennai by trai...,im traveling chennai train coach s my berth di...
3,4,5,@RailMinIndia irctc is not responding at the t...,@RailMinIndia irctc is not responding at the t...,irctc responding time tatkal booking
4,5,7,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...,matter notified concerned official


In [135]:
data = data.drop(columns='SentimentText_Copy')

In [136]:
data.head()

Unnamed: 0,Item ID,Sentiment,SentimentText,Clean_text
0,1,0,"@RailMinIndia My PNR is 8348062961, I am in wa...",my pnr i way water toilet mess everywhere coac...
1,2,0,@sureshpprabhu @RailMinIndia AC not working in...,ac working prayag lucknow intercity today depa...
2,3,0,@RailMinIndia I'm traveling to chennai by trai...,im traveling chennai train coach s my berth di...
3,4,5,@RailMinIndia irctc is not responding at the t...,irctc responding time tatkal booking
4,5,7,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...,matter notified concerned official


In [111]:
# print(clean_text('@RailMinIndia  TRAIN NO 12656 THERE IS NO PROPER WATER SERVICE IN TRAIN KINDLY LOOK INTO MATTER.. https://t.co/VIQmURi8HK'))

train no there is no proper water service in train kindly look into matter httpstcoviqmurihk


In [139]:
tfidf_vectorizer = TfidfVectorizer(max_features = 5000 , stop_words = 'english')

In [140]:
X_tfidf = tfidf_vectorizer.fit_transform(data['Clean_text'])

X_tfidf = X_tfidf.toarray()

column_names = tfidf_vectorizer.get_feature_names_out()

tfidf_df = pd.DataFrame(X_tfidf , columns=column_names)



In [144]:
tfidf_df.head()

Unnamed: 0,aadhar,aaj,aap,aapne,aati,aaye,aayen,ab,abhi,abhiyaan,abhiyan,able,absolutely,absurd,abt,abu,abundance,abusing,ac,academic,accept,acceptance,accepted,accident,according,account,accountable,accumulated,achieved,acnd,act,action,active,activity,actual,add,addicated,additional,address,adi,...,wter,wth,wtht,wtng,wwrbct,wâ,xerox,xonal,xpress,xray,ya,yah,yashwantpur,yatra,yatri,yday,ye,year,yes,yesterday,yesvantpur,yojna,youdisappointed,youhttpstcovvmxawns,young,younger,ypr,yr,ystrdy,yt,yur,yuva,zee,ziyarat,zonal,zone,âïâïâïâï,ðhighlander,ðquirky,ðââï
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
tfidf_df.shape

(1366, 3884)

In [147]:
data.head()

Unnamed: 0,Item ID,Sentiment,SentimentText,Clean_text
0,1,0,"@RailMinIndia My PNR is 8348062961, I am in wa...",my pnr i way water toilet mess everywhere coac...
1,2,0,@sureshpprabhu @RailMinIndia AC not working in...,ac working prayag lucknow intercity today depa...
2,3,0,@RailMinIndia I'm traveling to chennai by trai...,im traveling chennai train coach s my berth di...
3,4,5,@RailMinIndia irctc is not responding at the t...,irctc responding time tatkal booking
4,5,7,@DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...,matter notified concerned official


In [148]:
tfidf_df['label'] = data['Sentiment']

In [149]:
tfidf_df.head()

Unnamed: 0,aadhar,aaj,aap,aapne,aati,aaye,aayen,ab,abhi,abhiyaan,abhiyan,able,absolutely,absurd,abt,abu,abundance,abusing,ac,academic,accept,acceptance,accepted,accident,according,account,accountable,accumulated,achieved,acnd,act,action,active,activity,actual,add,addicated,additional,address,adi,...,wth,wtht,wtng,wwrbct,wâ,xerox,xonal,xpress,xray,ya,yah,yashwantpur,yatra,yatri,yday,ye,year,yes,yesterday,yesvantpur,yojna,youdisappointed,youhttpstcovvmxawns,young,younger,ypr,yr,ystrdy,yt,yur,yuva,zee,ziyarat,zonal,zone,âïâïâïâï,ðhighlander,ðquirky,ðââï,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7


In [150]:
# tfidf_df.to_csv('./data/tfidf.csv')