## Data - Preprocessing

In [160]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [161]:
# Dataset downloaded from https://www.kaggle.com/hariharasudhanas/twitter-emoji-prediction
def _load_csv_without_first_column(path, column_names):
    """Read the CSV at `path`, discard its first column and relabel the rest."""
    frame = pd.read_csv(path, index_col=None)
    frame = frame.drop(columns=frame.columns[0])
    frame.columns = column_names
    return frame


# Tweets with their integer emoji label.
df = _load_csv_without_first_column('datasets/Train.csv', ['text', 'emoji'])
# Lookup table pairing each emoji character with its integer label.
df_mapping = _load_csv_without_first_column('datasets/Mapping.csv', ['emoji', 'number'])

In [162]:
# Preview the first five tweets together with their labels.
df.head(n=5)

Unnamed: 0,text,emoji
0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,Been friends since 7th grade. Look at us now w...,2
3,This is what it looks like when someone loves ...,3
4,RT @user this white family was invited to a Bl...,3


In [163]:
# Preview the first five emoji/number pairs of the lookup table.
df_mapping.head(n=5)

Unnamed: 0,emoji,number
0,😜,0
1,📸,1
2,😍,2
3,😂,3
4,😉,4


In [164]:
# Print the frame summary: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    70000 non-null  object
 1   emoji   70000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


### Checking null values

In [165]:
# Number of missing values in each column.
df.isna().sum()

text     0
emoji    0
dtype: int64

### Removing duplicates


In [166]:
# Flag every row that repeats an earlier row in all columns; the first
# occurrence of each repeated row is not flagged.
is_repeat = df.duplicated(keep='first')
duplicates = df[is_repeat]
print("Duplicate rows:", duplicates, sep="\n")

Duplicate Rows except first occurrence based on all columns are :
                                                    text  emoji
2697   WE WILL BE CLOSED ON THE 4th OF JULY. And it's...     11
3857   ... COME OUT AND PARTY WITH US at membersonlye...     16
9163                        ️ @ Disney's Magic Kingdom\n      9
11110                    ️ @ Epcot - Walt Disney World\n      9
12357                    ️ @ Epcot - Walt Disney World\n      9
...                                                  ...    ...
67887                         @ Disney's Magic Kingdom\n     13
68057                         @ Disney's Magic Kingdom\n     13
68938                    @ Times Square, New York City\n      3
69430                    @ Arlington National Cemetery\n     11
69465                             @ New York, New York\n      8

[69 rows x 2 columns]


In [167]:
# De-duplicate on the tweet text alone (not on text + label): for each distinct
# text only its first row is kept.
df = df.drop_duplicates(subset='text')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69832 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    69832 non-null  object
 1   emoji   69832 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.6+ MB


In [142]:
# Rows per emoji class, most frequent first.
counts = df['emoji'].value_counts().rename_axis('number').reset_index(name='counts')
# Look the emoji up by its `number` value rather than by df_mapping's row label,
# so the result does not depend on the row index happening to equal `number`.
emoji_by_number = df_mapping.set_index('number')['emoji']
emojis = emoji_by_number.loc[counts['number']]
counts = counts.assign(emoji=emojis.values)
print(counts)

    number  counts emoji
0        9   15067     ❤
1        2    7053     😍
2        3    6835     😂
3        7    4357     🔥
4       15    3835     💕
5       13    3235     ✨
6       16    2817     😎
7       17    2743     😊
8        1    2588     📸
9       11    2419    🇺🇸
10      14    2336     💙
11       6    2081     📷
12       8    1985     😘
13       5    1971     🎄
14      12    1905     ☀
15       4    1874     😉
16      19    1740     💯
17      18    1720     💜
18      10    1716     😁
19       0    1555     😜


Drop classes 14 (💙), 8 (😘) and 18 (💜).<br>
Merge classes 9 (❤), 15 (💕) and 2 (😍) into class 9.<br>
Merge classes 1 (📸) and 6 (📷) into class 6.<br>



### Merging similar emojis together

In [143]:
# Discard every tweet labelled 18, 14 or 8.
rows_to_drop = df.index[df['emoji'].isin([18, 14, 8])]
df = df.drop(index=rows_to_drop)
print(set(df['emoji']))

{0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15, 16, 17, 19}


In [144]:
# Fold labels 2 and 15 into 9, and label 1 into 6 (9 and 6 themselves are unchanged).
df['emoji'] = df['emoji'].replace({2: 9, 15: 9, 1: 6})


In [145]:
# Distinct labels left after dropping and merging classes.
print(set(df['emoji']))


{0, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 19}


In [146]:
# Rows per remaining emoji class, most frequent first.
counts = df['emoji'].value_counts().rename_axis('number').reset_index(name='counts')
# Look the emoji up by its `number` value rather than by df_mapping's row label,
# so the result does not depend on the row index happening to equal `number`.
emoji_by_number = df_mapping.set_index('number')['emoji']
emojis = emoji_by_number.loc[counts['number']]
counts = counts.assign(emoji=emojis.values)
print(counts)

    number  counts emoji
0        9   25955     ❤
1        3    6835     😂
2        6    4669     📷
3        7    4357     🔥
4       13    3235     ✨
5       16    2817     😎
6       17    2743     😊
7       11    2419    🇺🇸
8        5    1971     🎄
9       12    1905     ☀
10       4    1874     😉
11      19    1740     💯
12      10    1716     😁
13       0    1555     😜


In [147]:
# Old label -> new label, where the new label is the row position of that class
# in `counts`.
new_mapping = {
    old_label: position
    for position, old_label in zip(counts.index.tolist(), counts['number'].tolist())
}

print(new_mapping)


{9: 0, 3: 1, 6: 2, 7: 3, 13: 4, 16: 5, 17: 6, 11: 7, 5: 8, 12: 9, 4: 10, 19: 11, 10: 12, 0: 13}


In [148]:
# Relabel with a single dictionary lookup per row. Old and new labels overlap
# (e.g. 9 -> 0 while 0 -> 13), so a one-pass `map` is used instead of `replace`;
# any label missing from `new_mapping` becomes NaN and is caught by the assertion
# instead of passing through unchanged.
relabelled = df['emoji'].map(new_mapping)
assert relabelled.notna().all(), "every emoji label must have an entry in new_mapping"
new_df = df.assign(emoji=relabelled.astype('int64'))
print(set(new_df['emoji']))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}


In [None]:
# Rebind `new_mapping` from the relabelling dict to a table: the `counts` frame
# with its old `number` column replaced by the row position (the new label).
new_mapping = counts.drop(columns=['number']).assign(number=counts.index)
print(new_mapping)

# Persist the relabelled mapping and data next to the raw files.
new_mapping.to_csv('datasets/new_mapping.csv', index=False)
new_df.to_csv('datasets/new_data.csv', index=False)

### Removing short tweets

In [150]:
# Keep only tweets with more than five whitespace-separated tokens.
tokens = new_df['text'].str.split()
count = tokens.str.len()
new_df = new_df[count.gt(5)]

In [151]:
# Rows per class after removing short tweets, most frequent first.
counts = new_df['emoji'].value_counts().rename_axis('number').reset_index(name='counts')
# Look the emoji up by its `number` value rather than by new_mapping's row label,
# so the result does not depend on the row index happening to equal `number`.
emoji_by_number = new_mapping.set_index('number')['emoji']
emojis = emoji_by_number.loc[counts['number']]
counts = counts.assign(emoji=emojis.values)
print(counts)

    number  counts emoji
0        0   24554     ❤
1        1    6319     😂
2        2    4532     📷
3        3    4097     🔥
4        4    3049     ✨
5        5    2622     😎
6        6    2573     😊
7        7    2247    🇺🇸
8        8    1867     🎄
9       10    1811     😉
10       9    1785     ☀
11      12    1616     😁
12      11    1558     💯
13      13    1498     😜


In [152]:
print(new_df.shape)
# Write the per-class counts and the cleaned tweets, in that order.
for frame, path in (
    (counts, 'datasets/cleaned_mapping.csv'),
    (new_df, 'datasets/cleaned_data.csv'),
):
    frame.to_csv(path, index=False)

(60128, 2)


### Generating balanced Dataset

In [153]:
print(new_df.shape)
# NOTE(review): this binds a second name to the SAME frame, it is not a copy.
# The in-place drops in the balancing cell therefore also shrink `new_df`, and
# the later save/toy cells read `new_df` expecting the balanced rows, so switching
# this to `.copy()` requires updating those cells at the same time.
balanced_df = new_df
print(balanced_df.shape)
# Size of the most frequent class. `counts` is ordered by descending frequency,
# so its first row is that class. (Previously this used `tup`, which is only
# defined by the loop in the next cell and fails on a fresh kernel.)
largest_class = counts['number'].iloc[0]
balanced_df[balanced_df['emoji'] == largest_class].shape

(60128, 2)
(60128, 2)


(24554, 2)

In [154]:
# Cap every class at `desired_rows` rows by randomly discarding the surplus.
desired_rows = 10000

# Class sizes are taken from the frame itself (not from the earlier `counts`
# table) so the cell can be re-run safely: once a class is capped it is skipped.
for label, n_rows in balanced_df['emoji'].value_counts().items():
    if n_rows > desired_rows:
        delete_rows = n_rows - desired_rows
        print(n_rows, delete_rows)
        # Fixed seed so the same rows are discarded on every run.
        surplus = (
            balanced_df[balanced_df['emoji'] == label]
            .sample(n=delete_rows, random_state=42)
            .index
        )
        # Dropped in place on purpose: `balanced_df` is the same object as
        # `new_df`, and later cells read the balanced rows through `new_df`.
        balanced_df.drop(index=surplus, inplace=True)

24554 14554


In [155]:
# Rows per class after balancing; `sort=False` keeps value_counts' unsorted order.
counts = balanced_df['emoji'].value_counts(sort=False).rename_axis('number').reset_index(name='counts')
# Look the emoji up by its `number` value rather than by new_mapping's row label,
# so the result does not depend on the row index happening to equal `number`.
emoji_by_number = new_mapping.set_index('number')['emoji']
emojis = emoji_by_number.loc[counts['number']]
counts = counts.assign(emoji=emojis.values)
print(counts)

    number  counts emoji
0        0   10000     ❤
1        1    6319     😂
2        2    4532     📷
3        3    4097     🔥
4        4    3049     ✨
5        5    2622     😎
6        6    2573     😊
7        7    2247    🇺🇸
8        8    1867     🎄
9        9    1785     ☀
10      10    1811     😉
11      11    1558     💯
12      12    1616     😁
13      13    1498     😜


In [156]:
# Save the balanced counts and data. The balanced frame is written through its
# own name rather than through `new_df`, so the output stays correct even if the
# two names ever stop referring to the same object.
counts.to_csv('datasets/balanced_mapping.csv', index=False)
balanced_df.to_csv('datasets/balanced_data.csv', index=False)
print(balanced_df.shape)

### Generating Toy Dataset

In [157]:
# Build a small dataset with at most `desired_rows` rows per class.
# Work on a copy so the balanced frame is not destroyed by the down-sampling.
toy_df = balanced_df.copy()
toy_mapping = counts
desired_rows = 100

# Class sizes are taken from the frame itself so the cell is safe to re-run.
for label, n_rows in toy_df['emoji'].value_counts().items():
    if n_rows > desired_rows:
        delete_rows = n_rows - desired_rows
        # Fixed seed so the same rows are discarded on every run.
        surplus = (
            toy_df[toy_df['emoji'] == label]
            .sample(n=delete_rows, random_state=42)
            .index
        )
        toy_df = toy_df.drop(index=surplus)

In [158]:
# Rows per class in the toy dataset; `sort=False` keeps value_counts' unsorted order.
toy_counts = toy_df['emoji'].value_counts(sort=False).rename_axis('number').reset_index(name='counts')
# Look the emoji up by its `number` value rather than by toy_mapping's row label:
# toy_mapping comes from an unsorted value_counts, so its row labels are not
# guaranteed to equal `number`.
emoji_by_number = toy_mapping.set_index('number')['emoji']
emojis = emoji_by_number.loc[toy_counts['number']]
toy_counts = toy_counts.assign(emoji=emojis.values)
print(toy_counts)

    number  counts emoji
0        0     100     ❤
1        1     100     😂
2        2     100     📷
3        3     100     🔥
4        4     100     ✨
5        5     100     😎
6        6     100     😊
7        7     100    🇺🇸
8        8     100     🎄
9        9     100     ☀
10      10     100     😉
11      11     100     💯
12      12     100     😁
13      13     100     😜


In [159]:
# Write the toy per-class counts and the toy tweets, in that order.
for frame, path in (
    (toy_counts, 'datasets/toy_mapping.csv'),
    (toy_df, 'datasets/toy_data.csv'),
):
    frame.to_csv(path, index=False)