In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


/kaggle/input/sentiment-dataset-with-1-million-tweets/dataset.csv


In [2]:
# Read the tweet dataset from the Kaggle input directory into a DataFrame.
csv_path = "/kaggle/input/sentiment-dataset-with-1-million-tweets/dataset.csv"
df = pd.read_csv(csv_path)

### Tweet Cleaning with Python

In [3]:
df.head()

Unnamed: 0,Text,Language,Label
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,en,litigious
1,#BadBunny: Como dos gotas de agua: Joven se di...,es,negative
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,en,litigious
3,Rwanda is set to host the headquarters of Unit...,en,positive
4,OOPS. I typed her name incorrectly (today’s br...,en,litigious


#### Lowercasing all the letters

In [4]:
# Lowercase every tweet with the vectorised string accessor instead of a
# row-wise apply: one pass over the column, and missing values stay NaN
# rather than raising AttributeError on a float.
df['Clean'] = df['Text'].str.lower()

In [5]:
df[['Text','Clean']].head()

Unnamed: 0,Text,Clean
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,@charlie_corley @kristine1g @amyklobuchar @sty...
1,#BadBunny: Como dos gotas de agua: Joven se di...,#badbunny: como dos gotas de agua: joven se di...
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,https://t.co/yjnio0p1jv flagstar bank disclose...
3,Rwanda is set to host the headquarters of Unit...,rwanda is set to host the headquarters of unit...
4,OOPS. I typed her name incorrectly (today’s br...,oops. i typed her name incorrectly (today’s br...


#### Removing hashtags and mentions

In [6]:
import re

In [7]:
# Drop @mentions ("@" followed by letters, digits or underscores).
# Series.str.replace with regex=True applies the same regex as re.sub to each
# value without building a row object per tweet; NaN values are left as NaN.
df['Clean'] = df['Clean'].str.replace(r"@[A-Za-z0-9_]+", "", regex=True)

In [8]:
df[['Text','Clean']].head()

Unnamed: 0,Text,Clean
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,testimony is not evidence in a court of la...
1,#BadBunny: Como dos gotas de agua: Joven se di...,#badbunny: como dos gotas de agua: joven se di...
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,https://t.co/yjnio0p1jv flagstar bank disclose...
3,Rwanda is set to host the headquarters of Unit...,rwanda is set to host the headquarters of unit...
4,OOPS. I typed her name incorrectly (today’s br...,oops. i typed her name incorrectly (today’s br...


In [9]:
# Drop hashtags ("#" followed by letters, digits or underscores).
# Vectorised replacement instead of a row-wise apply; NaN values stay NaN.
df['Clean'] = df['Clean'].str.replace(r"#[A-Za-z0-9_]+", "", regex=True)

In [10]:
df[['Text','Clean']].head()

Unnamed: 0,Text,Clean
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,testimony is not evidence in a court of la...
1,#BadBunny: Como dos gotas de agua: Joven se di...,: como dos gotas de agua: joven se disfraza de...
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,https://t.co/yjnio0p1jv flagstar bank disclose...
3,Rwanda is set to host the headquarters of Unit...,rwanda is set to host the headquarters of unit...
4,OOPS. I typed her name incorrectly (today’s br...,oops. i typed her name incorrectly (today’s br...


#### Removing links

In [11]:
# Drop links that start with "http" (covers http:// and https://) up to the
# next whitespace. Vectorised replacement instead of a row-wise apply.
df['Clean'] = df['Clean'].str.replace(r"http\S+", "", regex=True)

In [12]:
# Drop links that start with "www." up to the next whitespace.
# The dot is escaped so only a literal "www." prefix matches; an unescaped "."
# matches any character and would also delete ordinary words such as
# "wwwhatever". Vectorised replacement instead of a row-wise apply.
df['Clean'] = df['Clean'].str.replace(r"www\.\S+", "", regex=True)

In [13]:
df[['Text','Clean']].head()

Unnamed: 0,Text,Clean
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,testimony is not evidence in a court of la...
1,#BadBunny: Como dos gotas de agua: Joven se di...,: como dos gotas de agua: joven se disfraza de...
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,flagstar bank discloses a data breach that im...
3,Rwanda is set to host the headquarters of Unit...,rwanda is set to host the headquarters of unit...
4,OOPS. I typed her name incorrectly (today’s br...,oops. i typed her name incorrectly (today’s br...


#### Removing punctuation

In [14]:
# Replace parentheses, exclamation marks and question marks with a space so
# the words on either side stay separated. Vectorised instead of row-wise apply.
df['Clean'] = df['Clean'].str.replace(r"[()!?]", " ", regex=True)

In [15]:
# Replace any "[...]" span (shortest match) with a space.
# The pattern is a raw string: "\[" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df['Clean'] = df['Clean'].str.replace(r"\[.*?\]", " ", regex=True)

In [16]:
df[['Text','Clean']].head(15)

Unnamed: 0,Text,Clean
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,testimony is not evidence in a court of la...
1,#BadBunny: Como dos gotas de agua: Joven se di...,: como dos gotas de agua: joven se disfraza de...
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,flagstar bank discloses a data breach that im...
3,Rwanda is set to host the headquarters of Unit...,rwanda is set to host the headquarters of unit...
4,OOPS. I typed her name incorrectly (today’s br...,oops. i typed her name incorrectly today’s br...
5,It sucks for me since I'm focused on the natur...,it sucks for me since i'm focused on the natur...
6,@en_font Treballar a l’obra a partir dels 19 a...,treballar a l’obra a partir dels 19 anys fins...
7,@ShawnTarloff @itsmieu you can also relate thi...,you can also relate this to art too a lot...
8,Social Security. Constant political crises dis...,social security. constant political crises dis...
9,@FilmThePoliceLA A broken rib can puncture a l...,a broken rib can puncture a lung or lead to a...


#### Filtering non-alphanumeric characters

In [17]:
# Replace every character that is not an ASCII lowercase letter or digit with
# a space. Note this also blanks accented letters and other non-ASCII text.
# Vectorised replacement instead of a row-wise apply; NaN values stay NaN.
df['Clean'] = df['Clean'].str.replace(r"[^a-z0-9]", " ", regex=True)

In [18]:
df[['Text','Clean']].head(20)

Unnamed: 0,Text,Clean
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,testimony is not evidence in a court of la...
1,#BadBunny: Como dos gotas de agua: Joven se di...,como dos gotas de agua joven se disfraza de...
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,flagstar bank discloses a data breach that im...
3,Rwanda is set to host the headquarters of Unit...,rwanda is set to host the headquarters of unit...
4,OOPS. I typed her name incorrectly (today’s br...,oops i typed her name incorrectly today s br...
5,It sucks for me since I'm focused on the natur...,it sucks for me since i m focused on the natur...
6,@en_font Treballar a l’obra a partir dels 19 a...,treballar a l obra a partir dels 19 anys fins...
7,@ShawnTarloff @itsmieu you can also relate thi...,you can also relate this to art too a lot...
8,Social Security. Constant political crises dis...,social security constant political crises dis...
9,@FilmThePoliceLA A broken rib can puncture a l...,a broken rib can puncture a lung or lead to a...


#### Tokenization


In [19]:
# Split each cleaned tweet on runs of whitespace into a list of tokens.
# Series.str.split() with no separator mirrors str.split() per value while
# avoiding a row-wise apply; NaN values stay NaN.
df['Clean'] = df['Clean'].str.split()

In [20]:
df[['Text','Clean']].head(20)

Unnamed: 0,Text,Clean
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,"[testimony, is, not, evidence, in, a, court, o..."
1,#BadBunny: Como dos gotas de agua: Joven se di...,"[como, dos, gotas, de, agua, joven, se, disfra..."
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,"[flagstar, bank, discloses, a, data, breach, t..."
3,Rwanda is set to host the headquarters of Unit...,"[rwanda, is, set, to, host, the, headquarters,..."
4,OOPS. I typed her name incorrectly (today’s br...,"[oops, i, typed, her, name, incorrectly, today..."
5,It sucks for me since I'm focused on the natur...,"[it, sucks, for, me, since, i, m, focused, on,..."
6,@en_font Treballar a l’obra a partir dels 19 a...,"[treballar, a, l, obra, a, partir, dels, 19, a..."
7,@ShawnTarloff @itsmieu you can also relate thi...,"[you, can, also, relate, this, to, art, too, a..."
8,Social Security. Constant political crises dis...,"[social, security, constant, political, crises..."
9,@FilmThePoliceLA A broken rib can puncture a l...,"[a, broken, rib, can, puncture, a, lung, or, l..."


### Functions

In [21]:
# Compiled once at definition time so repeated calls reuse the same pattern.
# The original final range ran from U+24C2 straight through to U+1F251, which
# also covers CJK ideographs, Kana, Hangul and many other scripts, so ordinary
# non-Latin text was deleted. It is split here into the symbol-only blocks
# that lie inside that span.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U00002BFF"  # enclosed letters, shapes, misc symbols, arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji located in CJK symbol blocks
    "\U0000FE00-\U0000FE0F"  # variation selectors that follow emoji
    "\U0001F000-\U0001F251"  # tiles, playing cards, enclosed supplements
    "]+"
)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Args:
        string: Text to clean.

    Returns:
        A copy of ``string`` in which every run of characters matched by
        ``_EMOJI_PATTERN`` has been deleted. Letters of any script, digits
        and whitespace are left untouched.
    """
    return _EMOJI_PATTERN.sub("", string)


def clean_tweet(tweet):
    """Normalise a raw tweet and split it into lowercase tokens.

    Steps, in order: lowercase, drop apostrophes, drop @mentions, drop the
    "#" sign (the hashtag word itself is kept), remove emoji, drop http links,
    blank out ``()!?`` and ``[...]`` spans, blank out everything that is not
    ``a-z`` or ``0-9``, then split on whitespace.

    Args:
        tweet: The tweet text. ``None`` or a float (how pandas represents a
            missing value) is treated as "no tweet".

    Returns:
        A list of tokens for a string input, or the empty string ``""`` for a
        missing value (kept as-is for backward compatibility).
    """
    # np.float was only an alias of the builtin float; it was deprecated in
    # NumPy 1.20 and removed in 1.24, where referencing it raises
    # AttributeError. isinstance also accepts np.float64, a float subclass.
    if tweet is None or isinstance(tweet, float):
        return ""
    temp = tweet.lower()
    # Delete apostrophes (straight and typographic) so contractions stay one
    # token ("don't" -> "dont") instead of being split by the filter below.
    temp = re.sub(r"['’]", "", temp)
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"#", "", temp)
    temp = remove_emoji(temp)
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"[()!?]", " ", temp)
    # Raw string: "\[" in a normal literal is an invalid escape sequence.
    temp = re.sub(r"\[.*?\]", " ", temp)
    temp = re.sub(r"[^a-z0-9]", " ", temp)
    return temp.split()

In [22]:
tweet = "I am Python Programmer🙈🙉🙊 don't ask me to hack facebook account🙏"

In [23]:
clean_tweet(tweet)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


['i',
 'am',
 'python',
 'programmer',
 'dont',
 'ask',
 'me',
 'to',
 'hack',
 'facebook',
 'account']