## Import Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import regex as re
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Read Data

In [None]:
# Load the tweet dataset from the working directory and preview its first five rows.
DATA_PATH = 'tweets.csv'
df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1.0
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1.0
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1.0
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1.0
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0.0


In [None]:
# Only the last expression of a cell is echoed, so a bare `df.shape` on the
# first line is evaluated and silently discarded. Print it explicitly.
print(df.shape)
# info() prints the per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7350 entries, 0 to 7349
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        7350 non-null   int64  
 1   keyword   7350 non-null   object 
 2   location  5168 non-null   object 
 3   text      7349 non-null   object 
 4   target    7349 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 287.2+ KB


In [None]:
# Use the tweet id as the row index (rebinding instead of mutating in place).
df = df.set_index('id')

### Cleaning

In [None]:
# Keep only the tweet text and its label; `keyword` and `location` are dropped.
df = df.drop(columns=['keyword', 'location'])

In [None]:
# Rows that repeat an earlier row across all remaining columns.
df.loc[df.duplicated()]

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
103,thinking about how some1 spat blood at me duri...,0.0
292,[AUS] Vic Bushfire Appeal | Doggos | Animal Fa...,0.0
771,The human cull (from 38mins30secs https://t.co...,0.0
787,Does the push for mass vaccination point towar...,0.0
808,Does the push for vaccination point to a stage...,0.0
...,...,...
7062,We wanted to entertain you all with a good mov...,0.0
7063,We wanted to entertain you all with a good mov...,0.0
7066,We wanted to entertain you all with a good mov...,0.0
7304,Do not repeat this malign behaviour. #Iran mas...,0.0


In [None]:
# drop_duplicates() returns a new frame and leaves `df` untouched unless the
# result is assigned back; without the assignment the duplicates stay in `df`.
df = df.drop_duplicates()
df

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1.0
1,Telangana: Section 144 has been imposed in Bha...,1.0
2,Arsonist sets cars ablaze at dealership https:...,1.0
3,Arsonist sets cars ablaze at dealership https:...,1.0
4,"""Lord Jesus, your love brings freedom and pard...",0.0
...,...,...
7345,#1495Days Since #ZariaMassacre and the illegal...,1.0
7346,THE LIBERAL ANTI AMERICAN MEDIA SUPPORTS THE M...,0.0
7347,Iranian people have a great deal of respect fo...,0.0
7348,Mass murderer.,0.0


In [None]:
# Number of missing values per column.
df.isna().sum()

id        0
text      1
target    1
dtype: int64

In [None]:
# Discard rows that lack either the tweet text or its label.
df = df.dropna(subset=['text', 'target'])

## Word Preprocessing

In [None]:
def preprocess(text):
    """Normalize a raw tweet before tokenization.

    The text is lowercased, URLs are removed, and then every character that
    is not an ASCII letter, digit or whitespace is stripped (this also
    removes '#', '@' and punctuation).

    Parameters
    ----------
    text : object
        Raw tweet. Values that are not ``str`` are not cleaned.

    Returns
    -------
    str
        The cleaned text, or ``str(text)`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # If not a string, fall back to its string representation
        return str(text)

    text = text.lower()
    # URLs must be removed BEFORE punctuation is stripped: once ':' and '/'
    # are gone, 'https://t.co/x' has become 'httpstcox' and can no longer be
    # matched, leaving URL residue in the cleaned text.
    text = re.sub(r'https?://\S+', '', text)
    # This also removes '#', so no separate hashtag substitution is needed.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [None]:
# Run the cleaning function over every tweet and store the result alongside the raw text.
df['clean_text'] = df['text'].map(preprocess)

In [None]:
# Display the frame to compare the raw `text` column with the new `clean_text` column
df

Unnamed: 0_level_0,text,target,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1.0,communal violence in bhainsa telangana stones ...
1,Telangana: Section 144 has been imposed in Bha...,1.0,telangana section 144 has been imposed in bhai...
2,Arsonist sets cars ablaze at dealership https:...,1.0,arsonist sets cars ablaze at dealership httpst...
3,Arsonist sets cars ablaze at dealership https:...,1.0,arsonist sets cars ablaze at dealership httpst...
4,"""Lord Jesus, your love brings freedom and pard...",0.0,lord jesus your love brings freedom and pardon...
...,...,...,...
7344,1495Days Since the inhuman #ZariaGenocide by w...,1.0,1495days since the inhuman zariagenocide by wi...
7345,#1495Days Since #ZariaMassacre and the illegal...,1.0,1495days since zariamassacre and the illegal d...
7346,THE LIBERAL ANTI AMERICAN MEDIA SUPPORTS THE M...,0.0,the liberal anti american media supports the m...
7347,Iranian people have a great deal of respect fo...,0.0,iranian people have a great deal of respect fo...


## Tokenization and Stop-Word Removal

In [None]:
# Tokenizing
# word_tokenize relies on the Punkt models. Recent NLTK releases load them from
# the 'punkt_tab' resource while older ones use 'punkt', so fetch both to keep
# this cell working on a fresh environment regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Split each cleaned tweet into a list of word tokens.
df['tokens'] = df['clean_text'].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Display the frame to inspect the new `tokens` column
df

Unnamed: 0_level_0,text,target,clean_text,tokens
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1.0,communal violence in bhainsa telangana stones ...,"[communal, violence, in, bhainsa, telangana, s..."
1,Telangana: Section 144 has been imposed in Bha...,1.0,telangana section 144 has been imposed in bhai...,"[telangana, section, 144, has, been, imposed, ..."
2,Arsonist sets cars ablaze at dealership https:...,1.0,arsonist sets cars ablaze at dealership httpst...,"[arsonist, sets, cars, ablaze, at, dealership,..."
3,Arsonist sets cars ablaze at dealership https:...,1.0,arsonist sets cars ablaze at dealership httpst...,"[arsonist, sets, cars, ablaze, at, dealership,..."
4,"""Lord Jesus, your love brings freedom and pard...",0.0,lord jesus your love brings freedom and pardon...,"[lord, jesus, your, love, brings, freedom, and..."
...,...,...,...,...
7344,1495Days Since the inhuman #ZariaGenocide by w...,1.0,1495days since the inhuman zariagenocide by wi...,"[1495days, since, the, inhuman, zariagenocide,..."
7345,#1495Days Since #ZariaMassacre and the illegal...,1.0,1495days since zariamassacre and the illegal d...,"[1495days, since, zariamassacre, and, the, ill..."
7346,THE LIBERAL ANTI AMERICAN MEDIA SUPPORTS THE M...,0.0,the liberal anti american media supports the m...,"[the, liberal, anti, american, media, supports..."
7347,Iranian people have a great deal of respect fo...,0.0,iranian people have a great deal of respect fo...,"[iranian, people, have, a, great, deal, of, re..."


In [None]:
# Stop Words
# Fetch the NLTK stop-word corpus first; the corpus reader below needs it on disk.
nltk.download('stopwords')
# English stop words, used when filtering the token lists.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Remove stop words from every row's token list.
# Each element of df['tokens'] is a list of tokens, so the filter has to run
# per token inside each row. Calling .lower() on the list raises AttributeError,
# and testing a whole list against `stop_words` never matches, so nothing is
# removed.
stop_word_set = set(stop_words)  # set membership avoids rescanning the list per token
cleaner_words = [
    [token for token in tokens if token.lower() not in stop_word_set]
    for tokens in df['tokens']
]
df['cleaner_text'] = cleaner_words

In [None]:
# Display the frame to inspect the new `cleaner_text` column
df

Unnamed: 0_level_0,text,target,clean_text,tokens,cleaner_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"Communal violence in Bhainsa, Telangana. ""Ston...",1.0,communal violence in bhainsa telangana stones ...,"[communal, violence, in, bhainsa, telangana, s...","[communal, violence, in, bhainsa, telangana, s..."
1,Telangana: Section 144 has been imposed in Bha...,1.0,telangana section 144 has been imposed in bhai...,"[telangana, section, 144, has, been, imposed, ...","[telangana, section, 144, has, been, imposed, ..."
2,Arsonist sets cars ablaze at dealership https:...,1.0,arsonist sets cars ablaze at dealership httpst...,"[arsonist, sets, cars, ablaze, at, dealership,...","[arsonist, sets, cars, ablaze, at, dealership,..."
3,Arsonist sets cars ablaze at dealership https:...,1.0,arsonist sets cars ablaze at dealership httpst...,"[arsonist, sets, cars, ablaze, at, dealership,...","[arsonist, sets, cars, ablaze, at, dealership,..."
4,"""Lord Jesus, your love brings freedom and pard...",0.0,lord jesus your love brings freedom and pardon...,"[lord, jesus, your, love, brings, freedom, and...","[lord, jesus, your, love, brings, freedom, and..."
...,...,...,...,...,...
7344,1495Days Since the inhuman #ZariaGenocide by w...,1.0,1495days since the inhuman zariagenocide by wi...,"[1495days, since, the, inhuman, zariagenocide,...","[1495days, since, the, inhuman, zariagenocide,..."
7345,#1495Days Since #ZariaMassacre and the illegal...,1.0,1495days since zariamassacre and the illegal d...,"[1495days, since, zariamassacre, and, the, ill...","[1495days, since, zariamassacre, and, the, ill..."
7346,THE LIBERAL ANTI AMERICAN MEDIA SUPPORTS THE M...,0.0,the liberal anti american media supports the m...,"[the, liberal, anti, american, media, supports...","[the, liberal, anti, american, media, supports..."
7347,Iranian people have a great deal of respect fo...,0.0,iranian people have a great deal of respect fo...,"[iranian, people, have, a, great, deal, of, re...","[iranian, people, have, a, great, deal, of, re..."
