# NLP Tutorial - Level 1

Natural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.


In [312]:
print("Here we import the packages we are going to need to do the analysis")
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt

Here we import the packages we are going to need to do the analysis


## The data

In [314]:
# Load the airline tweets CSV into a DataFrame called df
df = pd.read_csv("dataset_sentiment/Tweets_level_1.csv")

# Progress messages, one per line
print(
    "Dataset has been imported as df",
    "Let's have a look at the first 10 rows",
    sep="\n",
)

# Last expression of the cell: the notebook renders it as a table
df.head(10)

Dataset has been imported as df
Let's have a look at the first 10 rows


Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment,text
0,3,570301031407624196,negative,@VirginAmerica it's really aggressive to blast...
1,4,570300817074462722,negative,@VirginAmerica and it's a really big bad thing...
2,5,570300767074181121,negative,@VirginAmerica seriously would pay $30 a fligh...
3,9,570295459631263746,positive,"@VirginAmerica it was amazing, and arrived an ..."
4,11,570289724453216256,positive,@VirginAmerica I &lt;3 pretty graphics. so muc...
5,12,570289584061480960,positive,@VirginAmerica This is such a great deal! Alre...
6,14,570285904809598977,positive,@VirginAmerica Thanks!
7,16,570277724385734656,positive,@VirginAmerica So excited for my first cross c...
8,17,570276917301137409,negative,@VirginAmerica I flew from NYC to SFO last we...
9,18,570270684619923457,positive,I ❤️ flying @VirginAmerica. 👍 ☺️


**We see that each row is a single tweet and each column gives us one specific piece of information about that tweet.**

In [315]:
# .shape is a (number of rows, number of columns) pair; unpack it into a and b
a, b = df.shape

print(f"The dataframe has {a} rows and {b} columns.")

print(f"Which also means we have {a} Tweets and {b} pieces of info about each tweet.")

The dataframe has 9134 rows and 4 columns.
Which also means we have 9134 Tweets and 4 pieces of info about each tweet.


In [None]:
# .columns holds the column labels of the dataset
columns = df.columns

print("The columns of the dataset are: \n")

# Number the columns starting from 1
for position, name in enumerate(columns, start=1):
    print(position, name)

### What's each column for?

We can go to the page where we got the dataset from, and we will find a description of each column: https://www.kaggle.com/crowdflower/twitter-airline-sentiment

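1. **tweet_id** The ID of the tweet
2. **airline_sentiment** The sentiment of the tweet
3. **text** The text of the Tweet

Our copy of the dataset also contains a fourth column, **Unnamed: 0**, which is not described on that page; it appears to be a row index left over from when the file was saved.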


# Let's look at the Tweets


In [310]:
# One row per tweet, so the number of rows is the number of tweets
n_tweets = len(df.index)
print(f"There is a total of {n_tweets} Tweets.")

There is a total of 9134 Tweets.


In [311]:
# Print the first ten raw tweets, each followed by its sentiment label
preview = zip(df['text'][0:10], df['airline_sentiment'][0:10])
for number, (text, sentiment) in enumerate(preview, start=1):
    print("Tweet " + str(number) + ":", text, "(" + str(sentiment) + ")\n")

Tweet 1: @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse (negative)

Tweet 2: @VirginAmerica and it's a really big bad thing about it (negative)

Tweet 3: @VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA (negative)

Tweet 4: @VirginAmerica it was amazing, and arrived an hour early. You're too good to me. (positive)

Tweet 5: @VirginAmerica I &lt;3 pretty graphics. so much better than minimal iconography. :D (positive)

Tweet 6: @VirginAmerica This is such a great deal! Already thinking about my 2nd trip to @Australia &amp; I haven't even gone on my 1st trip yet! ;p (positive)

Tweet 7: @VirginAmerica Thanks! (positive)

Tweet 8: @VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo (positive)

Tweet 9: @VirginAmerica  I flew from NYC to 

# Let's start the cleaning!

# Basic cleaning

In [None]:
import re
import string
import emoji

# Plain Python list of the raw tweet texts, in row order
tweets = df['text'].tolist()

def basic_cleaning(tweet):
    """Normalise the raw text of a single tweet.

    Steps, in order: lowercase the text; drop ``#`` characters (the hashtag's
    word is kept); drop ``@mentions``; drop HTML entities such as ``&amp;``;
    drop digits; drop ASCII punctuation; finally pass the text through
    ``emoji.demojize``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens can
        leave runs of spaces behind.

    Notes
    -----
    NOTE(review): the punctuation step also strips the characters that make up
    text emoticons such as ``:)`` or ``:/``, so an emoticon translation applied
    to this function's output will not find them.
    """
    #This makes text lowercase
    tweet = tweet.lower()

    #This removes the hash sign but keeps the hashtag's word
    tweet = tweet.replace("#", "")

    #This removes any sequence of characters that starts with "@" i.e. usernames
    #(raw string: "\s" inside a normal string literal is an invalid escape sequence)
    tweet = re.sub(r'@\S+', '', tweet)

    #Remove any HTML special symbols such as &amp;
    #The entity may sit at the very start of the text or after any whitespace,
    #and it ends at the next comma or whitespace character (including a newline),
    #so the word on the following line is not swallowed with it
    tweet = re.sub(r'(?:^|\s)&[^,\s]*', '', tweet)

    #We remove digits
    tweet = ''.join(char for char in tweet if not char.isdigit())

    #And now we can finally remove punctuation (all of string.punctuation in one pass)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    #Emojis are converted last, after the punctuation step, so whatever text
    #demojize produces is not touched by the punctuation removal above
    tweet = emoji.demojize(tweet)

    return tweet


In [None]:
#Run every tweet through basic_cleaning
tweets = list(map(basic_cleaning, tweets))

#Display the first ten cleaned tweets together with their sentiment label
for number, (text, sentiment) in enumerate(zip(tweets[0:10], df['airline_sentiment'][0:10]), start=1):
    print("Tweet " + str(number) + ":", text, "(" + str(sentiment) + ")\n")

# What do we do with emojis?

In [None]:
#the library that is gonna do the translation of complex emojis for us
import emoji

#Translating simple emojis
def translating_emojis(tweet):
    """Replace text emoticons with word tokens, then demojize the result.

    Each emoticon in the table below is replaced, in table order, by
    ``happy_face`` or ``unhappy_face``. The text is then passed through
    ``emoji.demojize`` and returned.
    """
    #(emoticon, replacement) pairs; happy ones first, then the sad ones
    emoticon_tokens = (
        (":d", "happy_face"),
        (":)", "happy_face"),
        (";d", "happy_face"),
        (";)", "happy_face"),
        (";p", "happy_face"),
        (":p", "happy_face"),
        (":(", "unhappy_face"),
        (":/", "unhappy_face"),
    )
    for emoticon, token in emoticon_tokens:
        tweet = tweet.replace(emoticon, token)

    #the emoji library deals with the complex emojis
    return emoji.demojize(tweet)


In [None]:
#Run the emoji translator over every tweet
tweets = list(map(translating_emojis, tweets))

#Preview the first ten results with their sentiment label
for number, (text, sentiment) in enumerate(zip(tweets[0:10], df['airline_sentiment'][0:10]), start=1):
    print(number, text, "(" + str(sentiment) + ")\n")

# Splitting a String of Text into Words

In [None]:
def tokenise_tweets(tweet):
    """Split a tweet into the list of its whitespace-separated tokens."""
    return tweet.split()

In [None]:
#Apply the tokeniser to all tweets
tweets = list(map(tokenise_tweets, tweets))

#Preview the first ten token lists with their sentiment label
for number, (tokens, sentiment) in enumerate(zip(tweets[0:10], df['airline_sentiment'][0:10]), start=1):
    print(number, tokens, "(" + str(sentiment) + ")\n")

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
from nltk.corpus import stopwords

#Cache so the stopword list is fetched from NLTK only once, not once per word
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return stopwords.words('english') as a frozenset, loading it on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


#Parts of speech tried in turn for each word:
#verb, noun, adjective ('a' and 's'), adverb
_LEMMA_POS_ORDER = ('v', 'n', 'a', 's', 'r')


def advance_cleaning(tweet):
    """Remove stopwords from a tokenised tweet and lemmatize what is left.

    Parameters
    ----------
    tweet : list of str
        The tokens of one tweet.

    Returns
    -------
    list of str
        The tokens that are not in ``stopwords.words('english')``, each one
        passed through ``wnl.lemmatize`` once per part of speech in
        ``_LEMMA_POS_ORDER``, feeding the result of one pass into the next.
    """
    #Remove stopwords in list stopwords.words('english')
    stop_words = _english_stopwords()
    kept = [word for word in tweet if word not in stop_words]

    #Lemmatize each word as verb, noun, adjective and adverb, in that order
    lemmas = []
    for word in kept:
        for pos in _LEMMA_POS_ORDER:
            word = wnl.lemmatize(word, pos=pos)
        lemmas.append(word)

    return lemmas

In [None]:
#Run advance_cleaning over every tokenised tweet
tweets = list(map(advance_cleaning, tweets))

#Preview the first ten results with their sentiment label
for number, (tokens, sentiment) in enumerate(zip(tweets[0:10], df['airline_sentiment'][0:10]), start=1):
    print(number, tokens, "(" + str(sentiment) + ")\n")

In [None]:
#We put the preprocessed tweets back to the dataset as a new column
df['preprocessed_tweets'] = tweets

#Our dataset now looks like this
print("Our dataset now has a new column called preprocessed_tweets")
df.head(3)


# Some visual exploration

In [None]:
import itertools

# Every word of every preprocessed tweet, pooled into one flat list
all_tweets = [word for tokens in df['preprocessed_tweets'] for word in tokens]

# Rows split by sentiment label
positive_df = df.loc[df['airline_sentiment'] == 'positive']
negative_df = df.loc[df['airline_sentiment'] == 'negative']

# The same flattening, restricted to the positive / negative tweets
positive_tweets = [word for tokens in positive_df['preprocessed_tweets'] for word in tokens]
negative_tweets = [word for tokens in negative_df['preprocessed_tweets'] for word in tokens]


### Exploring the length

In [None]:
# Number of tokens in each preprocessed tweet
list_of_lens = list(map(len, df['preprocessed_tweets']))

average = sum(list_of_lens) / len(list_of_lens)

print("Average length of tweets" , average, "words")

# Histogram of the tweet lengths, also written to disk
fig, ax = plt.subplots()
ax.hist(list_of_lens, density=False, bins=20)
ax.set_ylabel('Frequency')
ax.set_xlabel("Len of Tweets")
ax.set_title("Tweet Length")
fig.savefig("len_plots/tweet_lens.png")

### Vocab of positive tweets

In [None]:
from nltk import FreqDist
import matplotlib.pyplot as plt

print("Exploring positive tweets")

# Frequency of each word across all positive tweets
fdist = FreqDist(positive_tweets)

total_words = len(positive_tweets)
unique_words = len(set(positive_tweets))
top_ten = fdist.most_common(10)

print("Total number of words ", total_words)
print("Number of unique vocabulary items ", unique_words)
print("10 most common vocabulary items ", top_ten)

# Plot the 50 most frequent words and save the figure
fig = plt.figure(figsize=(10, 4))
fdist.plot(50, cumulative=False)
fig.suptitle('positive', fontsize=16)
fig.savefig('freq_dist_plots/freqDistpositive.png', bbox_inches="tight")

### Vocab of negative Tweets

In [None]:
print("Exploring negative tweets")

# Frequency of each word across all negative tweets
fdist = FreqDist(negative_tweets)

print("Total number of words ", len(negative_tweets))
print("Number of unique vocabulary items ", len(set(negative_tweets)))
print("10 most common vocabulary items ", fdist.most_common(10))

# Plot the 50 most frequent words and save the figure.
# The title and file name say 'negative' so this plot is labelled correctly
# and does not overwrite the file saved for the positive tweets.
fig = plt.figure(figsize=(10, 4))
fdist.plot(50, cumulative=False)
fig.suptitle('negative', fontsize=16)
fig.savefig('freq_dist_plots/freqDistnegative.png', bbox_inches="tight")

### POS Tags

In [None]:
from nltk.tag import map_tag
from collections import Counter

# Tag the pooled words of all tweets with part-of-speech tags
tags = nltk.pos_tag(all_tweets)

# Map each 'en-ptb' tag onto the coarser 'universal' tagset
simplifiedTags = [
    (word, map_tag('en-ptb', 'universal', tag))
    for word, tag in tags
]

# How many times each universal tag occurs
counts = Counter(universal_tag for _, universal_tag in simplifiedTags)

# Tag names and their counts, in matching order
keys = counts.keys()
frequency = list(counts.values())
y_pos = np.arange(len(frequency))

# Horizontal bar chart with one bar per tag
plt.barh(y_pos, frequency, align='center', alpha=0.4)
plt.yticks(y_pos, keys)
plt.xlabel('Frequency')
plt.ylabel('POS tag')
plt.title('Most common POS tags ')

plt.show()