# Modules 

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import re 
import string 
import nltk
import warnings
%matplotlib inline 

warnings.filterwarnings('ignore')

In [47]:
df = pd.read_csv('twitter sentimental.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [48]:
# datatype info 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


#Preprocessing the data
1) removing user handle 
2) removing special characters , numbers and punctuations 
3) removing 2 length word , stop words have been removed in feature extraction 

In [49]:
def remove_pattern(input_txt,pattern):
    r = re.findall(pattern,input_txt)
    for word in r:
        input_txt = re.sub(word,"",input_txt)
    return input_txt

In [50]:
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'],"@[\w]*")

In [51]:
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation


In [52]:
df['clean_tweet'] = df['clean_tweet'].str.replace("[^a-zA-Z#]"," ")
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can t use cause th...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation,factsguide society now #motivation


In [53]:
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks #lyft credit cause they offer wheelchai...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model love take with time
4,5,0,factsguide: society now #motivation,factsguide society #motivation


In [54]:
df.drop('cleean_tweet', inplace=True, axis=1)

KeyError: "['cleean_tweet'] not found in axis"

In [None]:
df.head()

In [None]:
# tokenization 
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

In [None]:
# stemming 
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
tokenized_tweet.head()

In [None]:
# combine into single sentence 
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = " ".join(tokenized_tweet[i])
df['clean_tweet'] = tokenized_tweet
df.head()

# EDA

In [None]:
# Frequent words 
all_words = " ".join([sentence for sentence in df['clean_tweet']])
from wordcloud import WordCloud
wordcloud = WordCloud(width = 800, height = 500 , random_state=42, max_font_size = 100).generate(all_words)

#graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#frequent words visualization for +ve 
all_words = " ".join([sentence for sentence in df['clean_tweet'][df['label']==0]])
wordcloud = WordCloud(width = 800, height = 500 , random_state=42, max_font_size = 100).generate(all_words)

#graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#frequent words visualization for -ve 
all_words = " ".join([sentence for sentence in df['clean_tweet'][df['label']==1]])
wordcloud = WordCloud(width = 800, height = 500 , random_state=42, max_font_size = 100).generate(all_words)

#graph
plt.figure(figsize=(15,8))
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# extract the hashtag 
def hashtag_extract(tweets):
    hashtags = []
    # loop words in the tweet 
    for tweet in tweets:
        ht = re.findall(r"#(\w+)",tweet)
        hashtags.append(ht)
    return hashtags

In [None]:
# extract hashtags from non racist/sexist tweets
ht_positive = hashtag_extract(df['clean_tweet'][df['label']==0])

# extract hashtags from racist/sexist tweets
ht_negative = hashtag_extract(df['clean_tweet'][df['label']==1])


In [None]:
# combining in singlee list
ht_positive = sum(ht_positive,[])
ht_negative = sum(ht_negative,[])

In [None]:
ht_positive[:5]

In [None]:
freq = nltk.FreqDist(ht_positive)
d = pd.DataFrame({'Hashtag': list(freq.keys()),
                  'Count': list(freq.values())})
d.head()

In [None]:
# select top 10 hashtags 
d = d.nlargest(columns='Count',n=10)
plt.figure(figsize=(15,9))
sns.barplot(data=d,x='Hashtag',y='Count')
plt.show()

In [None]:
# now for negative 
freq = nltk.FreqDist(ht_negative)
d = pd.DataFrame({'Hashtag': list(freq.keys()),
                  'Count': list(freq.values())})
d.head()

In [None]:
d = d.nlargest(columns='Count',n=10)
plt.figure(figsize=(15,9))
sns.barplot(data=d,x='Hashtag',y='Count')
plt.show()

In [None]:
#input split 
from sklearn.feature_extraction.text import CountVectorizer
# bow is for bag of words 
bow_vectorizer = CountVectorizer(max_df = 0.90, min_df=2, max_features=1000,stop_words='english') 
bow = bow_vectorizer.fit_transform(df['clean_tweet'])


In [None]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(bow,df['label'],random_state=42,test_size=0.25)

# Model Training 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score , accuracy_score

In [None]:
#training 
model = LogisticRegression()
model.fit(x_train,y_train)

In [None]:
#testing 
pred = model.predict(x_test)
f1_score(y_test,pred)

In [None]:
accuracy_score(y_test,pred)

In [None]:
# using probability 
pred_prob = model.predict_proba(x_test)
pred = pred_prob[:,1] >= 0.3
pred = pred.astype(np.int)
f1_score(y_test,pred)