In [8]:
import pandas as pd
df=pd.read_csv('all_annotated.tsv',sep='\t')
df_text=df[['Tweet']]
df_text.head()

Unnamed: 0,Tweet
0,Bugün bulusmami lazimdiii
1,Volkan konak adami tribe sokar yemin ederim :D
2,Bed
3,I felt my first flash of violence at some fool...
4,Ladies drink and get in free till 10:30


## i) Lowercasing

In [9]:
df_text['Tweet'] = df_text['Tweet'].str.lower()
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].str.lower()


Unnamed: 0,Tweet
0,bugün bulusmami lazimdiii
1,volkan konak adami tribe sokar yemin ederim :d
2,bed
3,i felt my first flash of violence at some fool...
4,ladies drink and get in free till 10:30


## ii) Remove Extra Whitespaces

In [10]:
def remove_whitespace(text):
  return " ".join(text.split())

In [15]:
df_text['Tweet'] = df_text['Tweet'].apply(remove_whitespace)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_whitespace)


Unnamed: 0,Tweet
0,bugün bulusmami lazimdiii
1,volkan konak adami tribe sokar yemin ederim :d
2,bed
3,i felt my first flash of violence at some fool...
4,ladies drink and get in free till 10:30


In [19]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## iii) Tokenization

In [27]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> punkt


    Downloading package punkt to /root/nltk_data...
      Unzipping tokenizers/punkt.zip.



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [28]:
from nltk import word_tokenize

df_text['Tweet'] = df_text['Tweet'].apply(lambda X: word_tokenize(X))
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(lambda X: word_tokenize(X))


Unnamed: 0,Tweet
0,"[bugün, bulusmami, lazimdiii]"
1,"[volkan, konak, adami, tribe, sokar, yemin, ed..."
2,[bed]
3,"[i, felt, my, first, flash, of, violence, at, ..."
4,"[ladies, drink, and, get, in, free, till, 10:30]"


## iv) Spelling Correction

In [39]:
pip install pyspellchecker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.1


In [40]:
from spellchecker import SpellChecker

In [46]:
def spell_check(text):

  result = []
  spell = SpellChecker()

  for word in text:

    correct_word = spell.correction(word)
    result.append(correct_word)

  return result

In [88]:
df_text['Tweet'] = df_text['Tweet'].apply(spell_check)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(spell_check)


Unnamed: 0,Tweet
0,"[i, u, i, i, i, i, i, u, i, u, i, i, a, i, i, ..."
1,"[i, o, i, i, a, i, i, i, o, i, a, i, i, a, i, ..."
2,"[i, e, i]"
3,"[i, e, i, i, i, i, i, i, i, i, i, i, i, a, i, ..."
4,"[i, a, i, i, i, i, i, i, i, i, i, i, e, i, i, ..."


## v) Removing Stopwords

In [53]:
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords


    Downloading package stopwords to /root/nltk_data...
      Unzipping corpora/stopwords.zip.



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [55]:
from nltk.corpus import stopwords

en_stopwords = stopwords.words('english')

def remove_stopwords(text):
    result = []
    for token in text:
        if token not in en_stopwords:
            result.append(token)
            
    return result

In [58]:
df_text['Tweet'] = df_text['Tweet'].apply(remove_stopwords)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_stopwords)


Unnamed: 0,Tweet
0,"[bugün, bulusmami, lazimdiii]"
1,"[volkan, konak, adami, tribe, sokar, yemin, ed..."
2,[bed]
3,"[felt, first, flash, violence, fool, bumped, ...."
4,"[ladies, drink, get, free, till, 10:30]"


## vi) Removing Punctuations

In [59]:
from nltk.tokenize import RegexpTokenizer

def remove_punct(text):
    
    tokenizer = RegexpTokenizer(r"\w+")
    lst=tokenizer.tokenize(' '.join(text))
    return lst

In [62]:
df_text['Tweet'] = df_text['Tweet'].apply(remove_punct)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_punct)


Unnamed: 0,Tweet
0,"[bugün, bulusmami, lazimdiii]"
1,"[volkan, konak, adami, tribe, sokar, yemin, ed..."
2,[bed]
3,"[felt, first, flash, violence, fool, bumped, p..."
4,"[ladies, drink, get, free, till, 10, 30]"


## vii) Removing Frequent Words

In [64]:
from nltk import FreqDist

def frequent_words(df):
    
    lst = []
    for text in df.values:
        lst += text[0]
    fdist = FreqDist(lst)
    
    return fdist.most_common(10)

In [65]:
def remove_freq_words(text):
    
    result=[]
    for item in text:
        if item not in lst:
            result.append(item)
    
    return result

In [67]:
frequent_words(df_text)

freq_words = frequent_words(df_text)

lst = []
for a,b in freq_words:
    lst.append(b)

In [69]:
df_text['Tweet'] = df_text['Tweet'].apply(remove_freq_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_freq_words)


In [89]:
df_text.head()

Unnamed: 0,Tweet
0,"[i, u, i, i, i, i, i, u, i, u, i, i, a, i, i, ..."
1,"[i, o, i, i, a, i, i, i, o, i, a, i, i, a, i, ..."
2,"[i, e, i]"
3,"[i, e, i, i, i, i, i, i, i, i, i, i, i, a, i, ..."
4,"[i, a, i, i, i, i, i, i, i, i, i, i, e, i, i, ..."


## viii) Lemmatization

In [70]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

In [80]:
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> omw-1.4


    Downloading package omw-1.4 to /root/nltk_data...



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [76]:
def lemmatization(text):
    
    result = []
    wordnet = WordNetLemmatizer()
    for token, tag in pos_tag(text):
        pos=tag[0].lower()
        
        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'
            
        result.append(wordnet.lemmatize(token,pos))
    
    return result

In [81]:
df_text['Tweet'] = df_text['Tweet'].apply(lemmatization)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(lemmatization)


Unnamed: 0,Tweet
0,"[bugün, bulusmami, lazimdiii]"
1,"[volkan, konak, adami, tribe, sokar, yemin, ed..."
2,[bed]
3,"[felt, first, flash, violence, fool, bump, pit..."
4,"[lady, drink, get, free, till, 10, 30]"


## ix) Stemming

In [82]:
from nltk.stem import PorterStemmer

def stemming(text):
    porter = PorterStemmer()
    
    result=[]
    for word in text:
        result.append(porter.stem(word))
    return result

In [83]:
df_text['Tweet'] = df_text['Tweet'].apply(stemming)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(stemming)


Unnamed: 0,Tweet
0,"[bugün, bulusmami, lazimdiii]"
1,"[volkan, konak, adami, tribe, sokar, yemin, ed..."
2,[bed]
3,"[felt, first, flash, violenc, fool, bump, piti..."
4,"[ladi, drink, get, free, till, 10, 30]"


## x) Removal of Tags

In [84]:
import re
def remove_tag(text):
    
    text=' '.join(text)
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [85]:
df_text['Tweet'] = df_text['Tweet'].apply(remove_tag)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_tag)


Unnamed: 0,Tweet
0,bugün bulusmami lazimdiii
1,volkan konak adami tribe sokar yemin ederim
2,bed
3,felt first flash violenc fool bump piti fool
4,ladi drink get free till 10 30


## xi) Removal of URLs

In [86]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [87]:
df_text['Tweet'] = df_text['Tweet'].apply(remove_urls)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_urls)


Unnamed: 0,Tweet
0,bugün bulusmami lazimdiii
1,volkan konak adami tribe sokar yemin ederim
2,bed
3,felt first flash violenc fool bump piti fool
4,ladi drink get free till 10 30
