# Data Creation, Simulation and Manipulation of Tweets and Emoticon data
- by Ian James H. Recto

we will be importing emoticon and tweets dataset which are available from the internet.

the tweet dataset we downloaded is a processed data that converted the emoticons into words and contains sentiment polarity, most of the available dataset with sentiment polarity are already processed.

We will be manipulating and simulating the tweet data set and add the emoticons while retaining the sentiment polarity.

This will ensure that the emoji icons will have the corresponding sentiment polarity we need


# Different Tweet Data for 2 Different Method of Emoticon Sentiment Analysis
- The tweetwer dataset above will be used for emotion sentiment analysis that the training of the text and emoticons are separated

- The tweeter dataset below will be used for emoticon sentiment analysis that includes the emoticons to the training

## Import pandas and numpy for data manipulatioin

In [1]:
import pandas as pd
import numpy as np

# Emoji data only emoticons
We wil be using an emoji dataset with sentiment polarity. You may download the file to this link: https://www.kaggle.com/thomasseleck/emoji-sentiment-data?select=Emoji_Sentiment_Data_v1.0.csv

In [2]:
df_emoji0 = pd.read_csv("dataset/Emoji_Sentiment_Data.csv")
df_emoticon0 = df_emoji0
pd.set_option('display.max_rows', df_emoticon0.shape[0]+1)
df_emoticon0

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons
5,😘,0x1f618,3648,0.85448,193,702,2753,FACE THROWING A KISS,Emoticons
6,😊,0x1f60a,3186,0.813302,189,754,2243,SMILING FACE WITH SMILING EYES,Emoticons
7,👌,0x1f44c,2925,0.805223,274,728,1923,OK HAND SIGN,Miscellaneous Symbols and Pictographs
8,💕,0x1f495,2400,0.765726,99,683,1618,TWO HEARTS,Miscellaneous Symbols and Pictographs
9,👏,0x1f44f,2336,0.78713,243,634,1459,CLAPPING HANDS SIGN,Miscellaneous Symbols and Pictographs


In [3]:
# Setup the data for emoji

df_emoji = pd.read_csv("dataset/Emoji_Sentiment_Data.csv")
df_emoji = df_emoji[df_emoji['Unicode block'] == 'Emoticons']
df_emoji.reset_index(inplace=True, drop=True)

df_emoticon = df_emoji
df_emoticon

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
2,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons
3,😘,0x1f618,3648,0.85448,193,702,2753,FACE THROWING A KISS,Emoticons
4,😊,0x1f60a,3186,0.813302,189,754,2243,SMILING FACE WITH SMILING EYES,Emoticons
5,😁,0x1f601,2189,0.796151,278,648,1263,GRINNING FACE WITH SMILING EYES,Emoticons
6,😩,0x1f629,1808,0.826214,1069,336,403,WEARY FACE,Emoticons
7,🙏,0x1f64f,1539,0.793848,124,648,767,PERSON WITH FOLDED HANDS,Emoticons
8,😏,0x1f60f,1522,0.764977,170,676,676,SMIRKING FACE,Emoticons
9,😉,0x1f609,1521,0.844833,151,513,857,WINKING FACE,Emoticons


In [4]:
# compare the polarity of the dataset and turn the polarity to binary
# 0 = negative, 1= positive
polarity_ls = []
for index, row in df_emoticon.iterrows():
    
    # polarity == sentiment
    # initial polarity is negative
    polarity = 0 
    
    # positive if positive value is greater than negative value
    arg_1 = row['Positive'] > row['Negative']
    
    # positive if neutral value is odd and positive and negative value are equal
    arg_2 = row['Positive'] == row['Negative'] and row['Neutral'] % 2 != 0 
    
    # positive if either of the two arguments are true
    if arg_1 or arg_2:
        polarity = 1
    polarity_ls.append(polarity)
    
# create new emoji dataset
df_emoticon2 = pd.DataFrame(polarity_ls, columns=['sentiment'])
df_emoticon2['emoji'] = df_emoticon['Emoji'].values
df_emoticon2['name'] = df_emoticon['Unicode name'].values
df_emoticon2

Unnamed: 0,sentiment,emoji,name
0,1,😂,FACE WITH TEARS OF JOY
1,1,😍,SMILING FACE WITH HEART-SHAPED EYES
2,0,😭,LOUDLY CRYING FACE
3,1,😘,FACE THROWING A KISS
4,1,😊,SMILING FACE WITH SMILING EYES
5,1,😁,GRINNING FACE WITH SMILING EYES
6,0,😩,WEARY FACE
7,1,🙏,PERSON WITH FOLDED HANDS
8,1,😏,SMIRKING FACE
9,1,😉,WINKING FACE


## Create Separate Sentiment Polarity Emoticon Dataset

In [5]:
def create_senti_dataset(df, polarity):
    emoticon_df = df.loc[df['sentiment'] == polarity]
    df_emoticon_df = pd.DataFrame(emoticon_df)
    df_emoticon_df.reset_index(inplace=True, drop=True)
    return df_emoticon_df

In [6]:
df_emoticon_pos = create_senti_dataset(df_emoticon2, 1)
df_emoticon_pos

Unnamed: 0,sentiment,emoji,name
0,1,😂,FACE WITH TEARS OF JOY
1,1,😍,SMILING FACE WITH HEART-SHAPED EYES
2,1,😘,FACE THROWING A KISS
3,1,😊,SMILING FACE WITH SMILING EYES
4,1,😁,GRINNING FACE WITH SMILING EYES
5,1,🙏,PERSON WITH FOLDED HANDS
6,1,😏,SMIRKING FACE
7,1,😉,WINKING FACE
8,1,🙌,PERSON RAISING BOTH HANDS IN CELEBRATION
9,1,🙈,SEE-NO-EVIL MONKEY


In [7]:
df_emoticon_neg = create_senti_dataset(df_emoticon2, 0)
df_emoticon_neg

Unnamed: 0,sentiment,emoji,name
0,0,😭,LOUDLY CRYING FACE
1,0,😩,WEARY FACE
2,0,😒,UNAMUSED FACE
3,0,😔,PENSIVE FACE
4,0,😡,POUTING FACE
5,0,😴,SLEEPING FACE
6,0,😞,DISAPPOINTED FACE
7,0,😪,SLEEPY FACE
8,0,😫,TIRED FACE
9,0,😕,CONFUSED FACE


# Emoji data all

In [8]:
_cols = ['Emoji', 'Negative', 'Neutral', 'Positive', 'Unicode name']
df_emoji = pd.read_csv("dataset/Emoji_Sentiment_Data.csv", usecols=_cols)
df_emoji

Unnamed: 0,Emoji,Negative,Neutral,Positive,Unicode name
0,😂,3614,4163,6845,FACE WITH TEARS OF JOY
1,❤,355,1334,6361,HEAVY BLACK HEART
2,♥,252,1942,4950,BLACK HEART SUIT
3,😍,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES
4,😭,2412,1218,1896,LOUDLY CRYING FACE
5,😘,193,702,2753,FACE THROWING A KISS
6,😊,189,754,2243,SMILING FACE WITH SMILING EYES
7,👌,274,728,1923,OK HAND SIGN
8,💕,99,683,1618,TWO HEARTS
9,👏,243,634,1459,CLAPPING HANDS SIGN


In [9]:
# compare the polarity of the dataset and turn the polarity to binary
# 0 = negative, 1= positive
polarity_ls = []
for index, row in df_emoji.iterrows():
    
    # polarity == sentiment
    # initial polarity is negative
    polarity = 0 
    
    # positive if positive value is greater than negative value
    arg_1 = row['Positive'] > row['Negative']
    
    # positive if neutral value is odd and positive and negative value are equal
    arg_2 = row['Positive'] == row['Negative'] and row['Neutral'] % 2 != 0 
    
    # positive if either of the two arguments are true
    if arg_1 or arg_2:
        polarity = 1
    polarity_ls.append(polarity)
    
# create new emoji dataset
df_emoji2 = pd.DataFrame(polarity_ls, columns=['sentiment'])
df_emoji2['emoji'] = df_emoji['Emoji'].values
df_emoji2['name'] = df_emoji['Unicode name'].values
df_emoji2

Unnamed: 0,sentiment,emoji,name
0,1,😂,FACE WITH TEARS OF JOY
1,1,❤,HEAVY BLACK HEART
2,1,♥,BLACK HEART SUIT
3,1,😍,SMILING FACE WITH HEART-SHAPED EYES
4,0,😭,LOUDLY CRYING FACE
5,1,😘,FACE THROWING A KISS
6,1,😊,SMILING FACE WITH SMILING EYES
7,1,👌,OK HAND SIGN
8,1,💕,TWO HEARTS
9,1,👏,CLAPPING HANDS SIGN


# Method 1: Manipulating the text-only tweets dataset


I have provided two tweet dataset with a size of 1000 and 10k. 


You will notice that the index skips numbers. The provided data was proccessed and downsized from the original 1.6m tweet dataset which can be download on this link: https://www.kaggle.com/kazanova/sentiment140

In [10]:
# more data better sentiment analysis but bigger size
filename = 'dataset/10k_tweet_dataset_from_1_6m.csv' 
# filename = 'dataset/1000_tweet_dataset.csv'

# read the file
df_posts = pd.read_csv(filename)
df_posts = pd.DataFrame(df_posts)
df_posts

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5
0,0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,160,0,1467853356,Mon Apr 06 22:30:54 PDT 2009,NO_QUERY,dbmendel,Picked Mich St to win it all from the get go. ...
2,320,0,1467894600,Mon Apr 06 22:41:51 PDT 2009,NO_QUERY,dreaaa,throat is closing up and i had some string che...
3,480,0,1467932208,Mon Apr 06 22:52:25 PDT 2009,NO_QUERY,rachelgab,"If he doesn't get better in a few days, he cou..."
4,640,0,1467972262,Mon Apr 06 23:03:39 PDT 2009,NO_QUERY,Smith_Cameron,@hillary006 I'm sure everyone has ruined my gi...
...,...,...,...,...,...,...,...
9995,1599200,4,2193373009,Tue Jun 16 08:22:10 PDT 2009,NO_QUERY,xxYOitsALEXxx,http://twitpic.com/7ham4 - i know now what is ...
9996,1599360,4,2193427329,Tue Jun 16 08:26:39 PDT 2009,NO_QUERY,whipzilla,- had a great time with some of the best peopl...
9997,1599520,4,2193454592,Tue Jun 16 08:28:53 PDT 2009,NO_QUERY,FFang,"@Tyrese4ReaL Tyreseee, when you're heading to ..."
9998,1599680,4,2193503480,Tue Jun 16 08:32:48 PDT 2009,NO_QUERY,JConnell,@theokk don't know what you could possibly mea...


In [11]:
# we get the list of texts
text_posts = df_posts['5']
text_posts

0       @switchfoot http://twitpic.com/2y1zl - Awww, t...
1       Picked Mich St to win it all from the get go. ...
2       throat is closing up and i had some string che...
3       If he doesn't get better in a few days, he cou...
4       @hillary006 I'm sure everyone has ruined my gi...
                              ...                        
9995    http://twitpic.com/7ham4 - i know now what is ...
9996    - had a great time with some of the best peopl...
9997    @Tyrese4ReaL Tyreseee, when you're heading to ...
9998    @theokk don't know what you could possibly mea...
9999            What a pretty day  &quot;Just smile&quot;
Name: 5, Length: 10000, dtype: object

### Removing word and tags from the tweets
we will remove them since hyperlinks and tags don't add much to the sentiment of the post

In [12]:
temp = []
for text in text_posts:
    remove_keys = ('@', 'http://', '&', '#',)
    # remove words that starts with symbols from the remvoe keys
    clean_text = ' '.join(txt for txt in text.split() if not txt.startswith(remove_keys))
    temp.append(clean_text)
text_posts = temp
text_posts

["- Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",
 "Picked Mich St to win it all from the get go. Was feeling pretty good about that pick all the way up until......tonight. A's lost too",
 'throat is closing up and i had some string cheese. not a good idea',
 "If he doesn't get better in a few days, he could have something lodged in his belly",
 "I'm sure everyone has ruined my gift to you Whitney has my serious Cell doing easter as well?",
 'my little pinky finger hurts so much..',
 'you left without saying hi!',
 'everything alright?',
 'Yo jimo i cant talk on aim anymore, its glitching ill cya later and i hope u see this',
 "Didn't make it by here today. They are saying we will have snow tomorrow. WTF? It is Tennessee. It doesn't even snow here in winter.",
 "awww i'm sorry",
 "Goodnight everyone. Well I'm not feeling much better and I'm going to the doctor tomorrow.",
 'i thought i saw you there! you were walking out the door when i saw you.',
 'I wo

In [13]:
df_posts['0'] = df_posts['0'].replace(4, 1)
df_posts

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5
0,0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,160,0,1467853356,Mon Apr 06 22:30:54 PDT 2009,NO_QUERY,dbmendel,Picked Mich St to win it all from the get go. ...
2,320,0,1467894600,Mon Apr 06 22:41:51 PDT 2009,NO_QUERY,dreaaa,throat is closing up and i had some string che...
3,480,0,1467932208,Mon Apr 06 22:52:25 PDT 2009,NO_QUERY,rachelgab,"If he doesn't get better in a few days, he cou..."
4,640,0,1467972262,Mon Apr 06 23:03:39 PDT 2009,NO_QUERY,Smith_Cameron,@hillary006 I'm sure everyone has ruined my gi...
...,...,...,...,...,...,...,...
9995,1599200,1,2193373009,Tue Jun 16 08:22:10 PDT 2009,NO_QUERY,xxYOitsALEXxx,http://twitpic.com/7ham4 - i know now what is ...
9996,1599360,1,2193427329,Tue Jun 16 08:26:39 PDT 2009,NO_QUERY,whipzilla,- had a great time with some of the best peopl...
9997,1599520,1,2193454592,Tue Jun 16 08:28:53 PDT 2009,NO_QUERY,FFang,"@Tyrese4ReaL Tyreseee, when you're heading to ..."
9998,1599680,1,2193503480,Tue Jun 16 08:32:48 PDT 2009,NO_QUERY,JConnell,@theokk don't know what you could possibly mea...


In [14]:
tp_df = pd.DataFrame(columns=['sentiment', 'post'])
tp_df['post'] = text_posts
tp_df['sentiment'] = df_posts['0']

### Fill up empty post cells
some tweets only contain tags. some post rows might now contain empty cells, we will fill it up with '...'

In [15]:
tp_nan = tp_df[tp_df['post'] == ''].index
tp_nan

Int64Index([  49,  233,  505, 1062, 1138, 1861, 2156, 2736, 2787, 3418, 3549,
            3850, 4201, 4591, 5368, 5673, 5975, 5989, 7282, 7947, 9347, 9435],
           dtype='int64')

In [16]:
for i in tp_nan:
    tp_df['post'][i] = '...'
tp_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,sentiment,post
0,0,"- Awww, that's a bummer. You shoulda got David..."
1,0,Picked Mich St to win it all from the get go. ...
2,0,throat is closing up and i had some string che...
3,0,"If he doesn't get better in a few days, he cou..."
4,0,I'm sure everyone has ruined my gift to you Wh...
...,...,...
9995,1,- i know now what is that haha X)
9996,1,- had a great time with some of the best peopl...
9997,1,"Tyreseee, when you're heading to The Netherlan..."
9998,1,"don't know what you could possibly mean, dear ..."


In [17]:
tp_df.to_csv('dataset/processed_tweet_dataset.csv')

In [18]:
tp_df

Unnamed: 0,sentiment,post
0,0,"- Awww, that's a bummer. You shoulda got David..."
1,0,Picked Mich St to win it all from the get go. ...
2,0,throat is closing up and i had some string che...
3,0,"If he doesn't get better in a few days, he cou..."
4,0,I'm sure everyone has ruined my gift to you Wh...
...,...,...
9995,1,- i know now what is that haha X)
9996,1,- had a great time with some of the best peopl...
9997,1,"Tyreseee, when you're heading to The Netherlan..."
9998,1,"don't know what you could possibly mean, dear ..."


# Method 2: Creating new data and randomizing emoticons

the tweet data that we will use will be from the following link
https://www.kaggle.com/shashank1558/preprocessed-twitter-tweets

In [19]:
csv_twt_pos = pd.read_csv("dataset/raw_data_tweets/processedPositive.csv")
csv_twt_neg = pd.read_csv("dataset/raw_data_tweets/processedNegative.csv")

In [20]:
csv_twt_pos

Unnamed: 0,An inspiration in all aspects: Fashion,fitness,beauty and personality. :) :* TheFashionIcon,Apka Apna Awam Ka Channel Frankline Tv Aam Admi Production Please Visit Or Likes Share :) Fb Page :...,Beautiful album from the greatest unsung guitar genius of our time - and I've met the great backstage,Good luck to Rich riding for great project in this Sunday. Can you donate?,Omg he... kissed... him crying with joy,happy anniv ming and papi!!!!! love love happy,thanks happy,C'mon Tweeps,...,$ES_F $SPY Bulls are just relentless happy Some setups I like today,Thanks for the recent follow Happy to connect happy have a great Thursday..33,Thanks for the recent follow Happy to connect happy have a great Thursday. Want this.5,Thanks for the recent follow Happy to connect happy have a great Thursday. Want this.7,you know that the problem still exist :D,Thanks for the recent follow Happy to connect happy have a great Thursday. Get this.1,#NAME?.1,ngam to weeks left for cadet pilot exam crying with joy,Great! You're welcome Josh happy ^Adam,Sixth spot not applicable Team! Higher pa! :) :* TheFashionIcon.1


In [21]:
csv_twt_neg

Unnamed: 0,How unhappy some dogs like it though,talking to my over driver about where I'm goinghe said he'd love to go to New York too but since Trump it's probably not,Does anybody know if the Rand's likely to fall against the dollar? I got some money I need to change into R but it keeps getting stronger unhappy,I miss going to gigs in Liverpool unhappy,There isnt a new Riverdale tonight ? unhappy,it's that A*dy guy from pop Asia and then the translator so they'll probs go with them around Aus unhappy,Who's that chair you're sitting in? Is this how I find out. Everyone knows now. You've shamed me in pu,don't like how jittery caffeine makes me sad,My area's not on the list unhappy think I'll go LibDems anyway,I want fun plans this weekend unhappy,...,and yet if parents invest in child's emotional education by taking child out of school on holiday early that's un,YG should have sent them to MCD. I want to see them holding the trophy unhappy anyways .9,i want more orientation unhappy,unhappy they not,YG should have sent them to MCD. I want to see them holding the trophy unhappy anyways .10,wish knock out lang talaga for the new school year are good and cooperative groupmates please unhappy,i miss so much unhappy,Same unhappy .1,Hi instant message your friend friend lang,hindi close friend? unhappy


### Data Cleaning for the tweet dataset
the dataset is listed by column.
We will list in by row and assign a sentiment polarity value (1 & 0, positive and negative respectively)

In [22]:
def get_tweet_polarity(df, polarity):
    dpos = {'Text': list(df.columns.values), 'Sentiment': polarity}
    df_twt = pd.DataFrame(data = dpos).iloc[:1000]
    return df_twt

In [23]:
df_twt_pos = get_tweet_polarity(csv_twt_pos, 1)
df_twt_pos

Unnamed: 0,Text,Sentiment
0,An inspiration in all aspects: Fashion,1
1,fitness,1
2,beauty and personality. :) :* TheFashionIcon,1
3,Apka Apna Awam Ka Channel Frankline Tv Aam Adm...,1
4,Beautiful album from the greatest unsung guit...,1
...,...,...
995,big love for Anne cox happy,1
996,Just a new happy,1
997,Hellooo happy Jackatkinson (jackat13),1
998,thanks for the recent follow. Much appreciated...,1


In [24]:
df_twt_neg = get_tweet_polarity(csv_twt_neg, 0)
df_twt_neg

Unnamed: 0,Text,Sentiment
0,How unhappy some dogs like it though,0
1,talking to my over driver about where I'm goin...,0
2,Does anybody know if the Rand's likely to fall...,0
3,I miss going to gigs in Liverpool unhappy,0
4,There isnt a new Riverdale tonight ? unhappy,0
...,...,...
995,I'm just really soft spoken unhappy,0
996,PLAYMFS: need moodbooster juseyo unhappy jily...,0
997,Koalas are dying of thirst and it's all becau...,0
998,pake 3 unhappy,0


In [25]:
df_emoticon_pos

Unnamed: 0,sentiment,emoji,name
0,1,😂,FACE WITH TEARS OF JOY
1,1,😍,SMILING FACE WITH HEART-SHAPED EYES
2,1,😘,FACE THROWING A KISS
3,1,😊,SMILING FACE WITH SMILING EYES
4,1,😁,GRINNING FACE WITH SMILING EYES
5,1,🙏,PERSON WITH FOLDED HANDS
6,1,😏,SMIRKING FACE
7,1,😉,WINKING FACE
8,1,🙌,PERSON RAISING BOTH HANDS IN CELEBRATION
9,1,🙈,SEE-NO-EVIL MONKEY


In [26]:
df_emoticon_neg

Unnamed: 0,sentiment,emoji,name
0,0,😭,LOUDLY CRYING FACE
1,0,😩,WEARY FACE
2,0,😒,UNAMUSED FACE
3,0,😔,PENSIVE FACE
4,0,😡,POUTING FACE
5,0,😴,SLEEPING FACE
6,0,😞,DISAPPOINTED FACE
7,0,😪,SLEEPY FACE
8,0,😫,TIRED FACE
9,0,😕,CONFUSED FACE


### Convert the ASCII emoji into Emoticon symbols

In [27]:
# corresponding emoticon sysmbols
txt_emoji = [
    ':)', ':P', ':D', ':|', ":'(", ':O', ":*", '<3', ':(', ';)',
    'xD', ':/', '=D'
]
txt_emoji_pic =[
    '😊', '😛', '😄', '😐', '😢', '😲', '😘', '😍', '😧', '😉', 
    '😁', '😒', '😀'
]

### Simulation of emoticons
here we will find each text based emojis and convert them into utf-8 emoticon symbols.
We will also add emoticons on half of the tweets to increase the sentiment value of the utf-8 emoticons

In [28]:
def emoji_conv(txt, conv_txt, conv_pic):
    temp = []
    for i in txt:
        for j in range(len(conv_txt)):
            if i == conv_txt[j]:
                i = conv_pic[j]
        temp.append(i)
    return ' '.join(temp)
    

In [29]:
def conv_emoticon_on_data(df_data):
    conv_text = []
    for idx, row in df_data.iterrows():
        txt = [i for i in row['Text'].split()]
        emoji_found = emoji_conv(txt, txt_emoji, txt_emoji_pic)
        conv_text.append(emoji_found)
    return conv_text

In [30]:
pos_conv_text = conv_emoticon_on_data(df_twt_pos)
pos_conv_text

['An inspiration in all aspects: Fashion',
 'fitness',
 'beauty and personality. 😊 😘 TheFashionIcon',
 'Apka Apna Awam Ka Channel Frankline Tv Aam Admi Production Please Visit Or Likes Share 😊 Fb Page :...',
 "Beautiful album from the greatest unsung guitar genius of our time - and I've met the great backstage",
 'Good luck to Rich riding for great project in this Sunday. Can you donate?',
 'Omg he... kissed... him crying with joy',
 'happy anniv ming and papi!!!!! love love happy',
 'thanks happy',
 "C'mon Tweeps",
 'Join vote for the singer! Do spread the word. 😄',
 'Thanks for the great review! smile',
 'Yay another art raffle! Everything you need to know is in the picture 😄',
 'Hello I hope you visit Luxor its amazing city in Egypt pleas check',
 'We got a Vive tracker in the office and our intern',
 "went to work.Don't get too excited",
 "this isn't",
 'Take a look at favourites.io You can do this and more happy',
 'Go back to school for music! I think I will in time happy',
 'Six

In [31]:
neg_conv_text = conv_emoticon_on_data(df_twt_neg)
neg_conv_text

['How unhappy some dogs like it though',
 "talking to my over driver about where I'm goinghe said he'd love to go to New York too but since Trump it's probably not",
 "Does anybody know if the Rand's likely to fall against the dollar? I got some money I need to change into R but it keeps getting stronger unhappy",
 'I miss going to gigs in Liverpool unhappy',
 'There isnt a new Riverdale tonight ? unhappy',
 "it's that A*dy guy from pop Asia and then the translator so they'll probs go with them around Aus unhappy",
 "Who's that chair you're sitting in? Is this how I find out. Everyone knows now. You've shamed me in pu",
 "don't like how jittery caffeine makes me sad",
 "My area's not on the list unhappy think I'll go LibDems anyway",
 'I want fun plans this weekend unhappy',
 'When can you notice me. unhappy what?',
 'Ahhhhh! You recognized LOGAN!!! Cinemax shows have a BAD track record for getting cancelled unhappy',
 "Errr dude.... They're gone unhappy Asked other league memeber to c

### Let's add emoticons for tweets containing the words below

In [32]:
# for additional inputs
add_emoji_txt = ['sad', 'unhappy', 'crying', 'smile', 'happy', 'love']
add_emoji_pic =['😔', '😧', '😆', '😭', '😊', '😍']

In [33]:
def add_emoji_text_data(df_data):
    reform_pos_text = []
    for ct in df_data:
        txt = [i for i in ct.split()]
        emoji_found = emoji_conv(txt, add_emoji_txt, add_emoji_pic)
        reform_pos_text.append(emoji_found)
    return reform_pos_text
                

In [34]:
pos_conv_text = add_emoji_text_data(pos_conv_text)
pos_conv_text

['An inspiration in all aspects: Fashion',
 'fitness',
 'beauty and personality. 😊 😘 TheFashionIcon',
 'Apka Apna Awam Ka Channel Frankline Tv Aam Admi Production Please Visit Or Likes Share 😊 Fb Page :...',
 "Beautiful album from the greatest unsung guitar genius of our time - and I've met the great backstage",
 'Good luck to Rich riding for great project in this Sunday. Can you donate?',
 'Omg he... kissed... him 😆 with joy',
 '😊 anniv ming and papi!!!!! 😍 😍 😊',
 'thanks 😊',
 "C'mon Tweeps",
 'Join vote for the singer! Do spread the word. 😄',
 'Thanks for the great review! 😭',
 'Yay another art raffle! Everything you need to know is in the picture 😄',
 'Hello I hope you visit Luxor its amazing city in Egypt pleas check',
 'We got a Vive tracker in the office and our intern',
 "went to work.Don't get too excited",
 "this isn't",
 'Take a look at favourites.io You can do this and more 😊',
 'Go back to school for music! I think I will in time 😊',
 'Sixth spot not applicable Team! Higher

In [35]:
neg_conv_text = add_emoji_text_data(neg_conv_text)
neg_conv_text

['How 😧 some dogs like it though',
 "talking to my over driver about where I'm goinghe said he'd 😍 to go to New York too but since Trump it's probably not",
 "Does anybody know if the Rand's likely to fall against the dollar? I got some money I need to change into R but it keeps getting stronger 😧",
 'I miss going to gigs in Liverpool 😧',
 'There isnt a new Riverdale tonight ? 😧',
 "it's that A*dy guy from pop Asia and then the translator so they'll probs go with them around Aus 😧",
 "Who's that chair you're sitting in? Is this how I find out. Everyone knows now. You've shamed me in pu",
 "don't like how jittery caffeine makes me 😔",
 "My area's not on the list 😧 think I'll go LibDems anyway",
 'I want fun plans this weekend 😧',
 'When can you notice me. 😧 what?',
 'Ahhhhh! You recognized LOGAN!!! Cinemax shows have a BAD track record for getting cancelled 😧',
 "Errr dude.... They're gone 😧 Asked other league memeber to check the guys are go",
 'Not you again 😔',
 'Why would Harvey be 

# Create the dataset of the processed emoji and tweet dataset

In [36]:
def new_df_emoji_tweet(data, polarity):
    temp = pd.DataFrame(columns=['sentiment', 'post'])
    temp['post'] = data
    temp['sentiment'] = polarity
    return temp

In [37]:
df_pos_tweets = new_df_emoji_tweet(pos_conv_text, 1)
df_pos_tweets

Unnamed: 0,sentiment,post
0,1,An inspiration in all aspects: Fashion
1,1,fitness
2,1,beauty and personality. 😊 😘 TheFashionIcon
3,1,Apka Apna Awam Ka Channel Frankline Tv Aam Adm...
4,1,Beautiful album from the greatest unsung guita...
...,...,...
995,1,big 😍 for Anne cox 😊
996,1,Just a new 😊
997,1,Hellooo 😊 Jackatkinson (jackat13)
998,1,thanks for the recent follow. Much appreciated...


In [38]:
df_neg_tweets = new_df_emoji_tweet(neg_conv_text, 0)
df_neg_tweets

Unnamed: 0,sentiment,post
0,0,How 😧 some dogs like it though
1,0,talking to my over driver about where I'm goin...
2,0,Does anybody know if the Rand's likely to fall...
3,0,I miss going to gigs in Liverpool 😧
4,0,There isnt a new Riverdale tonight ? 😧
...,...,...
995,0,I'm just really soft spoken 😧
996,0,PLAYMFS: need moodbooster juseyo 😧 jily_jelly
997,0,Koalas are dying of thirst and it's all becaus...
998,0,pake 3 😧


In [39]:
df_neg_tweets.to_csv("dataset/1k_data_tweets_emoticon_neg.csv")
df_pos_tweets.to_csv("dataset/1k_data_tweets_emoticon_pos.csv")

### check how many tweets do contain utf-8 emoticon

In [40]:
txt_emoji_pic =[
    '😊', '😛', '😄', '😐', '😢', '😲', '😘', '😍', '😧', '😉', 
    '😁', '😒', '😀', '😔', '😧', '😆', '😭'
]

def emoji_checker(data_text):
    has_emoji = False
    for i in txt_emoji_pic:
        if i in data_text:
            has_emoji = True
    return has_emoji
    
def post_emoji_counter(df_tweets):
    c = 0
    for idx, row in df_tweets.iterrows():
        if emoji_checker(row['post']):
            c += 1
    return f'{c} / 1000'

In [41]:
print(post_emoji_counter(df_neg_tweets))
print(post_emoji_counter(df_pos_tweets))
df_neg_tweets2 = pd.DataFrame(df_neg_tweets)
df_pos_tweets2 = pd.DataFrame(df_pos_tweets)

797 / 1000
696 / 1000


## Trim each data to 500
we will trim the data to 500, and balance the numbers of tweets with and without emoticons
then we will merge the 500 size data of positive and negative to create a dataset containing 1000 data with balanced number of polarity and emoticon ratio

In [42]:
def idx_with_emoji(df_tweets):
    idx_wemo = []
    c = 500
    for idx, row in df_tweets.iterrows():
        has_emoji = emoji_checker(row['post'])
        if has_emoji: idx_wemo.append(idx)  
        if has_emoji and c > 0 : c -= 1
        if c == 0 : break 
    return idx_wemo

In [43]:
pos_idxs = idx_with_emoji(df_pos_tweets2)
neg_idxs = idx_with_emoji(df_neg_tweets2)
print(pos_idxs)

[2, 3, 6, 7, 8, 10, 11, 12, 17, 18, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 47, 49, 50, 52, 54, 57, 58, 59, 61, 62, 63, 64, 65, 66, 68, 69, 71, 72, 73, 75, 76, 78, 79, 81, 83, 84, 85, 87, 90, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 107, 109, 110, 111, 114, 115, 116, 118, 119, 124, 125, 127, 130, 132, 133, 134, 135, 136, 137, 138, 140, 141, 142, 143, 144, 147, 149, 150, 151, 152, 153, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165, 168, 169, 172, 173, 174, 175, 176, 177, 178, 180, 182, 184, 185, 186, 187, 191, 193, 195, 196, 197, 198, 199, 201, 202, 203, 204, 205, 208, 210, 211, 213, 214, 215, 218, 219, 220, 221, 222, 224, 225, 226, 231, 232, 234, 235, 236, 238, 239, 242, 243, 244, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 258, 261, 262, 263, 265, 266, 267, 268, 269, 270, 271, 274, 275, 276, 277, 279, 280, 281, 283, 284, 285, 286, 287, 288, 289, 290, 293, 295, 296, 297, 299, 301, 302, 303, 305, 310, 312, 313, 314

In [44]:
df_pos_500 = pd.DataFrame(df_pos_tweets2)
df_neg_500 = pd.DataFrame(df_neg_tweets2)

In [45]:
df_pos_500.drop(df_pos_500.index[pos_idxs], inplace=True)
df_pos_500.reset_index(inplace=True, drop=True)
df_pos_500

Unnamed: 0,sentiment,post
0,1,An inspiration in all aspects: Fashion
1,1,fitness
2,1,Beautiful album from the greatest unsung guita...
3,1,Good luck to Rich riding for great project in ...
4,1,C'mon Tweeps
5,1,Hello I hope you visit Luxor its amazing city ...
6,1,We got a Vive tracker in the office and our in...
7,1,went to work.Don't get too excited
8,1,this isn't
9,1,retweet this tweet if you want your to be incl...


In [46]:
df_neg_500.drop(df_neg_500.index[neg_idxs], inplace=True)
df_neg_500.reset_index(inplace=True, drop=True)
df_neg_500

Unnamed: 0,sentiment,post
0,0,Who's that chair you're sitting in? Is this ho...
1,0,Becoz if we will depend on your promoting its ...
2,0,For those asking
3,0,the application is Kana Kanji Funtime! Sadly
4,0,Yeah did update to 16.04
5,0,it froze a few times. Then went to 16.10
6,0,Shaandaar
7,0,Zabardast
8,0,Ah alright
9,0,i don%27t know if you saw my comment yet


### Append and shuffle the positive and negative dataset

In [47]:
from sklearn.utils import shuffle
df_tweet_1000 = df_pos_500.append(df_neg_500)
df_tweet_1000 = shuffle(df_tweet_1000)
df_tweet_1000.reset_index(inplace=True, drop=True)

In [48]:
df_tweet_1000

Unnamed: 0,sentiment,post
0,0,One year ago today 😧 .1
1,1,keep smiling happy.1
2,0,It's hard to imagine anyone but Robin 😧 but st...
3,1,Good luck to Rich riding for great project in ...
4,1,He didn't play for a year
...,...,...
995,0,Maa ki kuss tumhari. Now take this bullshit yo...
996,0,Pozuelo (formerly of Swans) and Suso (Liverpoo...
997,0,Louis_Tomlinson follow me please? 😧
998,1,you know what i think? you look exceptional fo...


# Exporting

In [49]:
df_tweet_1000.to_csv('dataset/1k_data_emoji_tweets_senti_posneg.csv')

In [50]:
df_test = pd.read_csv('dataset/1k_data_emoji_tweets_senti_posneg.csv')
df_test

Unnamed: 0.1,Unnamed: 0,sentiment,post
0,0,0,One year ago today 😧 .1
1,1,1,keep smiling happy.1
2,2,0,It's hard to imagine anyone but Robin 😧 but st...
3,3,1,Good luck to Rich riding for great project in ...
4,4,1,He didn't play for a year
...,...,...,...
995,995,0,Maa ki kuss tumhari. Now take this bullshit yo...
996,996,0,Pozuelo (formerly of Swans) and Suso (Liverpoo...
997,997,0,Louis_Tomlinson follow me please? 😧
998,998,1,you know what i think? you look exceptional fo...


## small data for emoticons

In [51]:
df_emoticon

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
2,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons
3,😘,0x1f618,3648,0.85448,193,702,2753,FACE THROWING A KISS,Emoticons
4,😊,0x1f60a,3186,0.813302,189,754,2243,SMILING FACE WITH SMILING EYES,Emoticons
5,😁,0x1f601,2189,0.796151,278,648,1263,GRINNING FACE WITH SMILING EYES,Emoticons
6,😩,0x1f629,1808,0.826214,1069,336,403,WEARY FACE,Emoticons
7,🙏,0x1f64f,1539,0.793848,124,648,767,PERSON WITH FOLDED HANDS,Emoticons
8,😏,0x1f60f,1522,0.764977,170,676,676,SMIRKING FACE,Emoticons
9,😉,0x1f609,1521,0.844833,151,513,857,WINKING FACE,Emoticons


In [52]:
txt_emoji_pic =[
    '😊', '😛', '😄', '😐', '😢', '😲', '😘', '😍', '😧', '😉', 
    '😁', '😒', '😀', '😔', '😧', '😆', '😭'
]
df_emo_ls = df_emoticon[df_emoticon['Emoji'].isin(txt_emoji_pic)]
df_emo_ls = df_emo_ls.drop(columns=['Occurrences', 'Position', 'Negative', 'Neutral', 'Positive', 'Unicode block'])
df_emo_ls.reset_index(inplace=True, drop=True)
df_emo_ls

Unnamed: 0,Emoji,Unicode codepoint,Unicode name
0,😍,0x1f60d,SMILING FACE WITH HEART-SHAPED EYES
1,😭,0x1f62d,LOUDLY CRYING FACE
2,😘,0x1f618,FACE THROWING A KISS
3,😊,0x1f60a,SMILING FACE WITH SMILING EYES
4,😁,0x1f601,GRINNING FACE WITH SMILING EYES
5,😉,0x1f609,WINKING FACE
6,😄,0x1f604,SMILING FACE WITH OPEN MOUTH AND SMILING EYES
7,😒,0x1f612,UNAMUSED FACE
8,😔,0x1f614,PENSIVE FACE
9,😢,0x1f622,CRYING FACE


In [53]:
df_emo_ls.to_csv('dataset/15_emoticon_data.csv')

# Dataset Usage
we now have our processed dataset suitable for emoticon sentiment analysis