0. **initization**

In [25]:
import pandas as pd

df = pd.read_csv('/content/all_annotated.tsv', sep='\t')
df_text = df.loc[df['Definitely English'] == 1, ['Tweet']]
df_text.head()

Unnamed: 0,Tweet
2,Bed
3,I felt my first flash of violence at some fool...
4,Ladies drink and get in free till 10:30
7,Watching #Miranda On bbc1!!! @mermhart u r HIL...
9,Shopping! (@ Kohl's) http://t.co/I8ZkQHT9


1. **lowercasing**

In [27]:
df_text['Tweet'] = df_text['Tweet'].str.lower()
df_text.head()

Unnamed: 0,Tweet
2,bed
3,i felt my first flash of violence at some fool...
4,ladies drink and get in free till 10:30
7,watching #miranda on bbc1!!! @mermhart u r hil...
9,shopping! (@ kohl's) http://t.co/i8zkqht9


2. Remove Extra **Whitespaces**

In [28]:
def remove_whitespace(text):
    return  " ".join(text.split())

df_text['Tweet'] = df['Tweet'].apply(remove_whitespace)
df_text.head()

Unnamed: 0,Tweet
2,Bed
3,I felt my first flash of violence at some fool...
4,Ladies drink and get in free till 10:30
7,Watching #Miranda On bbc1!!! @mermhart u r HIL...
9,Shopping! (@ Kohl's) http://t.co/I8ZkQHT9


3. **Tokenization**

    need to download nltk first

In [29]:
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize 

df_text['Tweet'] = df_text['Tweet'].apply(lambda X: word_tokenize(X))
df_text.head()

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Pac

Unnamed: 0,Tweet
2,[Bed]
3,"[I, felt, my, first, flash, of, violence, at, ..."
4,"[Ladies, drink, and, get, in, free, till, 10:30]"
7,"[Watching, #, Miranda, On, bbc1, !, !, !, @, m..."
9,"[Shopping, !, (, @, Kohl, 's, ), http, :, //t...."


4. **Spelling Correction**

  ...need to install pyspellchecker

In [30]:
!pip install pyspellchecker
from spellchecker import SpellChecker

def spell_check(text):
    result = []
    spell = SpellChecker()
    for word in text:
        correct_word = spell.correction(word)
        if not (correct_word is None):
          result.append(correct_word)  
    
    return result

df_text['Tweet'] = df_text['Tweet'].apply(spell_check)
df_text.head()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Unnamed: 0,Tweet
2,[Bed]
3,"[I, felt, my, first, flash, of, violence, at, ..."
4,"[Ladies, drink, and, get, in, free, till]"
7,"[Watching, #, Miranda, On, back, !, !, !, @, m..."
9,"[Shopping, !, (, @, Kohl, is, ), help, :]"


5. **Removing Stopwords**

In [31]:
from nltk.corpus import stopwords

en_stopwords = stopwords.words('english')

def remove_stopwords(text):
    result = []
    for token in text:
        if token not in en_stopwords:
            result.append(token)
            
    return result

df_text['Tweet'] = df_text['Tweet'].apply(remove_stopwords)
df_text.head()

Unnamed: 0,Tweet
2,[Bed]
3,"[I, felt, first, flash, violence, fool, bumped..."
4,"[Ladies, drink, get, free, till]"
7,"[Watching, #, Miranda, On, back, !, !, !, @, m..."
9,"[Shopping, !, (, @, Kohl, ), help, :]"


6. **Removing Punctuations**

In [32]:
from nltk.tokenize import RegexpTokenizer

def remove_punct(text):
    tokenizer = RegexpTokenizer(r"\w+")
    lst = tokenizer.tokenize(' '.join(text))
    return lst

df_text['Tweet'] = df_text['Tweet'].apply(remove_punct)
df_text.head()

Unnamed: 0,Tweet
2,[Bed]
3,"[I, felt, first, flash, violence, fool, bumped..."
4,"[Ladies, drink, get, free, till]"
7,"[Watching, Miranda, On, back, merchant, u, HIL..."
9,"[Shopping, Kohl, help]"


7. **Removing Frequent Words**

In [33]:
from nltk import FreqDist

def frequent_words(df):
    
    lst=[]
    for text in df.values:
        lst+=text[0]
    fdist=FreqDist(lst)
    return fdist.most_common(10)

freq_words = frequent_words(df_text)

lst = []
for a,b in freq_words:
    lst.append(b)

def remove_freq_words(text):
    
    result=[]
    for item in text:
        if item not in lst:
            result.append(item)
    
    return result
    
df_text['Tweet'] = df_text['Tweet'].apply(remove_freq_words)
df_text.head()

Unnamed: 0,Tweet
2,[Bed]
3,"[I, felt, first, flash, violence, fool, bumped..."
4,"[Ladies, drink, get, free, till]"
7,"[Watching, Miranda, On, back, merchant, u, HIL..."
9,"[Shopping, Kohl, help]"


8. **Lemmatization**

In [34]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

def lemmatization(text):
    result=[]
    wordnet = WordNetLemmatizer()
    for token,tag in pos_tag(text):
        pos=tag[0].lower()
        
        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'
            
        result.append(wordnet.lemmatize(token,pos))
    
    return result

df_text['Tweet'] = df_text['Tweet'].apply(lemmatization)
df_text.head()

Unnamed: 0,Tweet
2,[Bed]
3,"[I, felt, first, flash, violence, fool, bump, ..."
4,"[Ladies, drink, get, free, till]"
7,"[Watching, Miranda, On, back, merchant, u, HIL..."
9,"[Shopping, Kohl, help]"


9. **Stemming**

In [35]:
from nltk.stem import PorterStemmer

def stemming(text):
    porter = PorterStemmer()
    
    result=[]
    for word in text:
        result.append(porter.stem(word))
    return result

df_text['Tweet'] = df_text['Tweet'].apply(stemming)
df_text.head()

Unnamed: 0,Tweet
2,[bed]
3,"[i, felt, first, flash, violenc, fool, bump, i..."
4,"[ladi, drink, get, free, till]"
7,"[watch, miranda, on, back, merchant, u, hilari]"
9,"[shop, kohl, help]"


10. **Removal of Tags**

In [36]:
import re
def remove_tag(text):
    text=' '.join(text)
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

df_text['Tweet'] = df_text['Tweet'].apply(remove_tag)
df_text.head()

Unnamed: 0,Tweet
2,bed
3,i felt first flash violenc fool bump i piti fool
4,ladi drink get free till
7,watch miranda on back merchant u hilari
9,shop kohl help


11. **Removal of URLs**

In [37]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df_text['Tweet'] = df_text['Tweet'].apply(remove_tag)
df_text.head()

Unnamed: 0,Tweet
2,b e d
3,i f e l t f i r s t f l a s h v i o l ...
4,l a d i d r i n k g e t f r e e t i l l
7,w a t c h m i r a n d a o n b a c k m ...
9,s h o p k o h l h e l p


12. **ok**

In [38]:
df_text.to_csv('/content/result.csv', index=True, index_label="index")