# Preprocess Data

* python script to preprocess the data using various techniques

## Install required libraries

In [1]:
!pip install emoji
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.1.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 5.1 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.1.0-py3-none-any.whl size=212392 sha256=ad06fc9b4e168c5d945d02b87dc44b5845a31c25976ffeb18871b49edbb8d30f
  Stored in directory: /root/.cache/pip/wheels/77/75/99/51c2a119f4cfd3af7b49cc57e4f737bed7e40b348a85d82804
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.

## Import required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
from tqdm.notebook import tqdm
import emoji
import transformers
from tokenizers import BertWordPieceTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from google.colab import drive

## Install required packages

In [3]:
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

## Read dataset

In [4]:
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp project/Dataset/biden_tweets_clean.csv')

df

Unnamed: 0.1,Unnamed: 0,Text
0,0,@RT_com That’s the guy who is funding those fa...
1,1,Biden apparently just told JTaps that he's goi...
2,2,@Kingofgameplay1 @HeathMayo They've been given...
3,3,@conorjrogers @reedgalen They could not raise ...
4,4,Can`t Biden just fire the board members on the...
...,...,...
17646,17646,"@KThomasDC @costareports That’s nice, but I ho..."
17647,17647,@livingdead1970 OMG. You are a sensitive soul...
17648,17648,@bryceagen @itsJeffTiedrich @realDonaldTrump @...
17649,17649,@Tomboliko @the_resistor @realDonaldTrump Hill...


In [6]:
df.columns

Index(['Unnamed: 0', 'Text'], dtype='object')

In [7]:
# drop irrelevant column(s)
df = df.drop('Unnamed: 0', axis=1)

df

Unnamed: 0,Text
0,@RT_com That’s the guy who is funding those fa...
1,Biden apparently just told JTaps that he's goi...
2,@Kingofgameplay1 @HeathMayo They've been given...
3,@conorjrogers @reedgalen They could not raise ...
4,Can`t Biden just fire the board members on the...
...,...
17646,"@KThomasDC @costareports That’s nice, but I ho..."
17647,@livingdead1970 OMG. You are a sensitive soul...
17648,@bryceagen @itsJeffTiedrich @realDonaldTrump @...
17649,@Tomboliko @the_resistor @realDonaldTrump Hill...


## Preprocess functions

In [8]:
def emoji_cleaning(text):
    
  # Change emoji to text
  text = emoji.demojize(text).replace(":", " ")
  
  # Delete repeated emoji
  tokenizer = text.split()
  repeated_list = []
  
  for word in tokenizer:
      if word not in repeated_list:
          repeated_list.append(word)
  
  text = ' '.join(text for text in repeated_list)
  text = text.replace("_", " ").replace("-", " ")
  return text

In [9]:
def clean_smileys(text):
    
    text = re.sub(r'(:\)|: \)|\(\:|:-\)|: -\)|: - \)|:D|: D)', ' smile ', text)
    text = re.sub(r'(:\(|: \(|\)\:|:-\(|: -\(|: - \(|:\'\()', ' dislike ', text)
    text = re.sub(r'(<3)', ' heart ', text)
    text = re.sub(r'(:/)', ' dislike ', text)
    text = re.sub(r'(;\)|; \))', ' wink ', text)
    return ' '.join([word for word in text.split()])

In [10]:
def clean_urls(review):
    review = review.split()
    review = ' '.join([word for word in review if not re.match('^http', word)])
    return review

In [11]:
def decontracted(text):
    text = re.sub(r"won\’t", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"can\’t", "can not", text)
    text = re.sub(r"n\’t", " not", text)
    text = re.sub(r"\’re", " are", text)
    text = re.sub(r"it\'s", "it is", text)
    text = re.sub(r"\’d", " would", text)
    text = re.sub(r"\’ll", " will", text)
    text = re.sub(r"\’t", " not", text)
    text = re.sub(r"\’ve", " have", text)
    text = re.sub(r"\’m", " am", text)

    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\’re", " are", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\’d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\’ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\’t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\’ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\’m", " am", text)
    text = re.sub(r"\“", "", text)
    text = re.sub(r"\”", "", text)
    text = re.sub(r"\…", "", text)

    return text

In [12]:
def clean_text(text):
    text = str(text)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'pic.twitter\S+', ' ', text)
    text = re.sub(r'#', '', text)
    text = text.lower()

    return text

## Preprocess tweets

In [13]:
stop_words = stopwords.words('english') 
stop_words.remove('not') 
lemmatizer = WordNetLemmatizer()

In [14]:
def data_preprocessing(review):
    
  # data cleaning
    review = re.sub(re.compile('<.*?>'), '', review) #removing html tags
    review =  re.sub('[^A-Za-z0-9]+', ' ', review) #taking only words
  
  # lowercase
    review = review.lower()
  
  # tokenization
    tokens = nltk.word_tokenize(review) # converts review to tokens
  
  # stop_words removal
    review = [word for word in tokens if word not in stop_words] #removing stop words
  
  # lemmatization
    review = [lemmatizer.lemmatize(word) for word in review]
  
  # join words in preprocessed review
    review = ' '.join(review)
    return review

In [15]:
df['Text'] = df['Text'].apply(clean_urls).apply(clean_text).apply(emoji_cleaning).apply(clean_smileys).apply(decontracted)
#.apply(correct_spellings)

In [16]:
df['Text'] = df['Text'].apply(lambda review: data_preprocessing(review))

df

Unnamed: 0,Text
0,rt com guy funding fake story hunter biden
1,biden apparently told jtaps going ask everyone...
2,kingofgameplay heathmayo given chance blown tr...
3,conorjrogers reedgalen could not raise money b...
4,biden fire board member postal governor replace
...,...
17646,kthomasdc costareports nice hope biden think m...
17647,livingdead omg sensitive soul record massive o...
17648,bryceagen itsjefftiedrich realdonaldtrump coac...
17649,tomboliko resistor realdonaldtrump hillary che...


## Export cleaned data

In [17]:
process_file_name = 'biden_tweets_processed.csv'
  
df.to_csv(process_file_name)