In [55]:
import pandas as pd
import re
import xlsxwriter

import nltk
from nltk.util import ngrams
from nltk import FreqDist

In [56]:
# Download for tokenize functionality

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/kurt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [57]:
# Import csv
df = pd.read_csv("input.csv", header=None)

# Get Series
tweet_list = df[0]
tweet_list.head()

0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: 0, dtype: object

In [58]:
# Clean tweets - removes any HTML and punctuation

clean_tweets = []

for t in tweet_list:
    t = re.sub(r'(<br />)|(<br>)|(\\n)|(\\r)|-', " ", str(t))
    t = re.sub(r'[\.,()+…?!"/:@\'“]', "", str(t))
    t = re.sub(r'(&amp;)', "and", str(t))
    t = " ".join(t.split())
    clean_tweets.append(t)
    
clean_tweets[3]

'VirginAmerica its really aggressive to blast obnoxious entertainment in your guests faces and they have little recourse'

In [59]:
# Join into one string and change to lower case
# Tokenize 

all_tweets = " ".join(clean_tweets).lower()
token_tweets = nltk.word_tokenize(all_tweets)

In [60]:
# Create ngrams and frequency distributions

tweet_sgrams = ngrams(token_tweets, 1)
tweet_bigrams = ngrams(token_tweets, 2)
tweet_trigrams = ngrams(token_tweets, 3)
tweet_quadgrams = ngrams(token_tweets, 4)
tweet_pentgrams = ngrams(token_tweets, 5)

tweet_fds = FreqDist(tweet_sgrams)
tweet_fdb = FreqDist(tweet_bigrams)
tweet_fdt = FreqDist(tweet_trigrams)
tweet_fdq = FreqDist(tweet_quadgrams)
tweet_fdp = FreqDist(tweet_pentgrams)

In [61]:
# Convert to DF

df_tweet_sgram = pd.DataFrame.from_dict(tweet_fds, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)
df_tweet_bigram = pd.DataFrame.from_dict(tweet_fdb, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)
df_tweet_trigram = pd.DataFrame.from_dict(tweet_fdt, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)
df_tweet_quadgram = pd.DataFrame.from_dict(tweet_fdq, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)
df_tweet_pentgram = pd.DataFrame.from_dict(tweet_fdp, orient='index', columns=['freq']).reset_index().sort_values(by=['freq'], ascending=False)

In [62]:
df_tweet_quadgram.head(10)

Unnamed: 0,index,freq
116501,"(our, fleets, on, fleek)",144
116500,"(jetblue, our, fleets, on)",137
36291,"(been, on, hold, for)",113
19079,"(ive, been, on, hold)",59
117481,"(rt, jetblue, our, fleets)",50
2908,"(thank, you, for, the)",50
10877,"(flight, was, cancelled, flightled)",48
24983,"(on, hold, for, over)",43
14420,"(for, over, an, hour)",41
3229,"(cancelled, flightled, my, flight)",30


In [64]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('ngrams_output.xlsx', engine='xlsxwriter')

# Write each dataframe to a different worksheet.
df_tweet_sgram.to_excel(writer, sheet_name='Book-Single')
df_tweet_bigram.to_excel(writer, sheet_name='Book-Bigram')
df_tweet_trigram.to_excel(writer, sheet_name='Book-Trigram')
df_tweet_quadgram.to_excel(writer, sheet_name='Book-Quadgram')
df_tweet_pentgram.to_excel(writer, sheet_name='Book-Pentgram')

# Close the Pandas Excel writer and output the Excel file.
writer.save()