In [None]:
# Load the training and test splits from CSV files in the working directory
import pandas as pd

train_data = pd.read_csv(filepath_or_buffer='Fix_The_Country_Train_data.csv')
test_data = pd.read_csv(filepath_or_buffer='Fix_The_Country_Test_data.csv')

In [None]:
# Training data overview: column summary, then the first 20 rows
train_data.info()
train_data.head(n=20)

In [None]:
# Test data overview: column summary, then the first 20 rows
test_data.info()
test_data.head(n=20)

In this competition, the aim is to identify and extract the "selected_text" from the "text" field of the test data set. Only after that can we cross-check whether the sentiment given in the "sentiment" column of the test data set is correct.

In [None]:
# Per-column counts for the Train data: non-missing values first, then missing values
print(train_data.notna().sum())
print(train_data.isna().sum())

One row in the training data set is missing both its "text" and "selected_text" values, so we discard it.

In [None]:
# Drop, in place, every training row that has a missing value in any column
train_data.dropna(axis='index', how='any', inplace=True)

In [None]:
# Per-column counts for the Test data: non-missing values first, then missing values
print(test_data.notna().sum())
print(test_data.isna().sum())

In [None]:
# Plot the frequency of each sentiment label in the Train data.
from matplotlib import pyplot as plt

# Colour is looked up by label, so a bar keeps the right colour whatever its
# position in the frequency-sorted counts.
SENTIMENT_COLORS = {'positive': 'green', 'negative': 'red', 'neutral': 'orange'}

# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_sentiments = train_data['Sentiment'].value_counts(sort=True)
bar_colors = [SENTIMENT_COLORS.get(str(label).lower(), 'grey') for label in count_sentiments.index]
count_sentiments.plot(kind='bar', color=bar_colors, alpha=0.8, rot=0)
plt.title("Distribution of Sentiment Types in Train Data")
# Tick labels come from the counted labels themselves (only capitalised), so
# they cannot drift from the bar order and work for any number of classes.
plt.xticks(range(len(count_sentiments)), [str(label).capitalize() for label in count_sentiments.index])
plt.xlabel("Sentiment Type")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Plot the frequency of each sentiment label in the Test data.
from matplotlib import pyplot as plt

# Colour is looked up by label, so a bar keeps the right colour whatever its
# position in the frequency-sorted counts.
SENTIMENT_COLORS = {'positive': 'green', 'negative': 'red', 'neutral': 'orange'}

# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_sentiments_te = test_data['Sentiment'].value_counts(sort=True)
bar_colors_te = [SENTIMENT_COLORS.get(str(label).lower(), 'grey') for label in count_sentiments_te.index]
count_sentiments_te.plot(kind='bar', color=bar_colors_te, alpha=0.8, rot=0)
plt.title("Distribution of Sentiment Types in Test Data")
# Tick labels come from the counted labels themselves (only capitalised), so
# they cannot drift from the bar order and work for any number of classes.
plt.xticks(range(len(count_sentiments_te)), [str(label).capitalize() for label in count_sentiments_te.index])
plt.xlabel("Sentiment Type")
plt.ylabel("Frequency")
plt.show()

In both the train and test data sets, the number of positive tweets is higher than the number of negative or neutral tweets.

In [None]:
# Removes punctuation from text. Convert entire text to lower case.
import string
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Coerce each 'content' entry to str, lower-case it and strip its punctuation
train_data['s_text_clean'] = (
    train_data['content']
    .apply(str)
    .apply(lambda raw: remove_punctuation(raw.lower()))
)
train_data.head(n=20)

In [None]:
# Split each cleaned text into word tokens: every match of the regular expression \w+ becomes one token
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
train_data['s_text_tokens'] = train_data['s_text_clean'].apply(str).apply(tokenizer.tokenize)
train_data.head(n=20)

In [None]:
# Remove stopwords
from nltk.corpus import stopwords
# NLTK stopword lists, loaded once per language instead of once per token.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None, extra_stop_words=("im",)):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    stop_words : iterable of str, optional
        Words to drop. Defaults to NLTK's English stopword list.
    extra_stop_words : iterable of str, optional
        Additional words to drop. Defaults to ``("im",)``, which is what
        "i'm" becomes once punctuation has been stripped.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    # A token is dropped if it is in EITHER collection. The previous
    # `... or w not in 'im'` condition kept almost every stopword, and
    # `in 'im'` was a substring test rather than an equality test.
    blocked = set(stop_words).union(extra_stop_words)
    return [w for w in text if w not in blocked]

# Filter the token list of every row through remove_stopwords
train_data['s_text_tokens_NOTstop'] = train_data['s_text_tokens'].apply(remove_stopwords)
train_data.head(n=20)

In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def word_lemmatizer(text):
    """Return a new list holding ``lemmatizer.lemmatize(token)`` for each token in ``text``."""
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatize the stopword-free token list of every row
train_data['s_text_lemma'] = train_data['s_text_tokens_NOTstop'].apply(word_lemmatizer)
train_data.head(n=20)

In [None]:
# Stemming
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def word_stemmer(text):
    """Stem each token in ``text`` with ``stemmer`` and return the stems joined by single spaces."""
    return " ".join(stemmer.stem(token) for token in text)

# Stem the lemmatized tokens of every row; the result is one space-joined string per row
train_data['s_text_stem'] = train_data['s_text_lemma'].apply(word_stemmer)
train_data.head(n=20)

In [None]:
import seaborn as sns

def unique_words_analysis(df, sentiments=('Positive', 'Neutral', 'Negative')):
    """Plot, one panel per sentiment, the distribution of unique-word counts per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'Sentiment' column and an 's_text_stem' column holding
        whitespace-separated text.
    sentiments : sequence of str, optional
        Sentiment labels to plot, left to right. Defaults to
        ('Positive', 'Neutral', 'Negative').

    Notes
    -----
    The function now reads from ``df``; it previously ignored the argument and
    used the global ``train_data``, and looped over an undefined ``sentiment``.
    ``sns.distplot`` is kept for compatibility but is deprecated in recent
    seaborn releases.
    """
    palette = {'Positive': 'green', 'Neutral': 'orange', 'Negative': 'red'}
    # squeeze=False keeps `axes` two-dimensional even when there is one panel.
    fig, axes = plt.subplots(1, len(sentiments), figsize=(16, 4), squeeze=False)
    for panel, label in zip(axes[0], sentiments):
        unique_counts = df.loc[df['Sentiment'] == label, 's_text_stem'].map(
            lambda x: len(set(x.split()))
        )
        # Leave the panel blank (title only) when the label has no rows.
        if not unique_counts.empty:
            sns.distplot(unique_counts.values, ax=panel, color=palette.get(label), rug=True)
        panel.set_title(label)
    fig.suptitle('Distribution of number of unique words')
    # plt.show() matches the other plotting cells; fig.show() warns on
    # non-GUI (inline) notebook backends.
    plt.show()

# Draw the unique-word-count distributions, passing the training data
unique_words_analysis(train_data)

We observe that the number of unique words in positive and negative tweets follows almost the same positively skewed distribution. Neutral tweets also follow a positively skewed distribution, but with a wider spread than the other two types.

In [None]:
# Split the training data into one frame per sentiment label
positive_train = train_data[train_data['Sentiment'].eq('Positive')]
neutral_train = train_data[train_data['Sentiment'].eq('Neutral')]
negative_train = train_data[train_data['Sentiment'].eq('Negative')]

In [None]:
# Common Word frequency analysis for positive text
from nltk.probability import FreqDist
import pandas as pd
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# 's_text_stem' holds one space-joined string per row, so each row is split
# into words before counting; passing the column directly would make FreqDist
# count whole texts instead of words.
fdist_pos = FreqDist(word for text in positive_train['s_text_stem'] for word in text.split())
top_twen_pos = fdist_pos.most_common(20)

df1 = pd.DataFrame(top_twen_pos, columns = ['Text' , 'count'])
df1.groupby('Text').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', color='green', linecolor='black', title='Top 20 Common Words in positive text',orientation='v')

In [None]:
# Common Word frequency analysis for neutral text

# 's_text_stem' holds one space-joined string per row, so each row is split
# into words before counting; passing the column directly would make FreqDist
# count whole texts instead of words.
fdist_neu = FreqDist(word for text in neutral_train['s_text_stem'] for word in text.split())
top_twen_neu = fdist_neu.most_common(20)

df2 = pd.DataFrame(top_twen_neu, columns = ['Text' , 'count'])
df2.groupby('Text').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', color='orange', linecolor='black', title='Top 20 Common Words in neutral text',orientation='v')

In [None]:
# Common Word frequency analysis for negative text

# 's_text_stem' holds one space-joined string per row, so each row is split
# into words before counting; passing the column directly would make FreqDist
# count whole texts instead of words.
fdist_neg = FreqDist(word for text in negative_train['s_text_stem'] for word in text.split())
top_twen_neg = fdist_neg.most_common(20)

df3 = pd.DataFrame(top_twen_neg, columns = ['Text' , 'count'])
df3.groupby('Text').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', color='red', linecolor='black', title='Top 20 Common Words in negative text',orientation='v')

Exploratory data analysis ends here. 