In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the exported conversation; the 'Date' column is parsed into datetimes
# so that the .dt accessor can be used on it later.
messages_df = pd.read_csv(
    'CONVERSATIONNAME.csv',
    parse_dates=['Date'],
)

# Enrich the df

In [None]:
# Normalise the message text. Missing messages become empty strings rather than
# the literal text 'nan', which would otherwise be counted as one 3-letter word.
messages_df['Message'] = messages_df['Message'].fillna('').astype(str)
# Number of characters in each message (vectorised, no per-row lambda).
messages_df['Letter_Count'] = messages_df['Message'].str.len()
# split() without a separator splits on any run of whitespace, so repeated
# spaces, tabs and newlines do not inflate the count and an empty message
# counts as 0 words instead of 1.
messages_df['Word_Count'] = messages_df['Message'].str.split().str.len()

In [None]:
# Hour of day extracted from each message's timestamp.
messages_df['Hour'] = messages_df['Date'].dt.hour

# Graph 

### Number of words each person wrote 

In [None]:
# Sum the per-message word counts for every author.
total_word_count_grouped_by_author = (
    messages_df.groupby('Author')[['Word_Count']].sum()
)
# Rank authors from most to fewest words and keep the first ten.
sorted_total_word_count_grouped_by_author = (
    total_word_count_grouped_by_author.sort_values(by='Word_Count', ascending=False)
)
top_10_sorted_total_word_count_grouped_by_author = (
    sorted_total_word_count_grouped_by_author.head(n=10)
)
# One horizontal bar per author; label the plot through the returned Axes.
word_count_ax = top_10_sorted_total_word_count_grouped_by_author.plot.barh()
word_count_ax.set_xlabel('Number of Words')
word_count_ax.set_ylabel('Authors')

### Number of letters each person wrote

In [None]:
# Sum the per-message letter counts for every author.
total_letter_count_grouped_by_author = (
    messages_df.groupby('Author')[['Letter_Count']].sum()
)
# Rank authors from most to fewest letters and keep the first ten.
sorted_total_letter_count_grouped_by_author = (
    total_letter_count_grouped_by_author.sort_values(by='Letter_Count', ascending=False)
)
top_10_sorted_total_letter_count_grouped_by_author = (
    sorted_total_letter_count_grouped_by_author.head(n=10)
)
# One horizontal bar per author; label the plot through the returned Axes.
letter_count_ax = top_10_sorted_total_letter_count_grouped_by_author.plot.barh()
letter_count_ax.set_xlabel('Number of Letters')
letter_count_ax.set_ylabel('Authors')

### How many words do we use per message?

In [None]:
# A wide, short canvas so the bar plot fits in the notebook's output cell.
word_freq_fig, word_freq_ax = plt.subplots(figsize=(15, 2))
# How often each per-message word count occurs; keep the 40 most frequent.
word_count_value_counts = messages_df['Word_Count'].value_counts()
top_40_word_count_value_counts = word_count_value_counts.head(n=40)
# Draw onto the Axes created above instead of relying on the current figure.
top_40_word_count_value_counts.plot.bar(ax=word_freq_ax)
word_freq_ax.set_xlabel('Word Count')
word_freq_ax.set_ylabel('Frequency')

### When do people write the most?

In [None]:
# Number of messages sent in each hour of the day. 'Hour' comes from dt.hour,
# so head(24) keeps every hour that occurs rather than a "top N" subset.
messages_per_hour = messages_df['Hour'].value_counts().head(24)
# Order the bars by hour (descending index) instead of by frequency.
hour_ax = messages_per_hour.sort_index(ascending=False).plot.barh()
hour_ax.set_xlabel('Number of messages')
hour_ax.set_ylabel('Hour of Day')

### NLTK

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Load NLTK's English stop-word list. On a fresh environment the corpus may not
# be installed, in which case NLTK raises LookupError; fetch it once and retry
# so the notebook survives Restart & Run All.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

# English Snowball stemmer and a tokenizer that splits on whitespace only.
stemmer = SnowballStemmer("english")
w_tokenizer = WhitespaceTokenizer()
def stemm_texts(text):
    """Return ``text`` with English stop words removed and the rest stemmed.

    The input is coerced with ``str()``, split on whitespace by
    ``w_tokenizer``, filtered against ``stop_words``, and every surviving
    token is passed through ``stemmer``. The stemmed tokens are joined back
    together with single spaces.

    Stop words are matched case-insensitively: tokens are lowercased for the
    membership test, so "The" is dropped just like "the".
    """
    tokens = w_tokenizer.tokenize(str(text))
    return ' '.join(
        stemmer.stem(w) for w in tokens if w.lower() not in stop_words
    )


# Stemmed, stop-word-free version of every message, stored alongside the original.
messages_df['steamed'] = messages_df['Message'].map(stemm_texts)

In [None]:
# Preview the first five rows, including the columns added above.
messages_df.head(n=5)

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Concatenate every stemmed message into one space-separated corpus.
text = " ".join(messages_df['steamed'])

# Words to hide from the cloud, on top of wordcloud's built-in STOPWORDS.
# Bound to its own name so the `stopwords` corpus reader imported from
# nltk.corpus is not overwritten by a plain set.
# NOTE(review): "drink", "wine", "flavor", "flavors" look like leftovers from
# a different dataset -- confirm they should be excluded for chat messages.
wordcloud_stopwords = set(STOPWORDS)
wordcloud_stopwords.update(["drink", "now", "wine", "flavor", "flavors","animated_media"])

# Generate the word cloud image from the corpus.
wordcloud = WordCloud(stopwords=wordcloud_stopwords, background_color="white").generate(text)

# Display the generated image with matplotlib, without axis ticks or frame.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()