In [225]:
import pandas as pd
from pathlib import Path
import re
from nltk.tokenize import TweetTokenizer
import collections

### Load data

In [226]:
# Load data
# test_df = pd.read_csv(Path('dataset/test.csv'))

In [227]:
train_df = pd.read_csv(Path('dataset/train.csv'))

In [228]:
train_df.head()

Unnamed: 0,sentiment,text
0,5,@manjulamartin @Kirk_Gleason Except trains are...
1,5,I want a Google driverless car.
2,5,@Oatmeal @google driverless @TeslaMotors ? Ooo...
3,5,SO MUCH AWESOME! Amazing video for GoogleÌ¢‰âÂ...
4,5,@google is making driverless cars which is awe...


In [229]:
# train_df.text[3]

### Task 1: remove emoji

In [230]:
def handle_emojis(tweet):
    """Remove common ASCII emoticons from a tweet.

    Each emoticon family is first rewritten to a placeholder token
    (' EMO_POS ' or ' EMO_NEG ') and every placeholder is then replaced by a
    single space, so the emoticon itself disappears from the text.

    Args:
        tweet: the raw tweet text.

    Returns:
        The tweet with the recognised emoticons replaced by whitespace.
    """
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    # The emoticon must be followed by whitespace OR the end of the string;
    # requiring whitespace only would miss a laugh that ends the tweet.
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)(?:\s|$)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)

    # The task is to remove emoticons, so drop the placeholders again.
    tweet = tweet.replace(' EMO_POS ', ' ')
    tweet = tweet.replace(' EMO_NEG ', ' ')

    return tweet

In [231]:
def check_diff(before_df, after):
    """Count the rows whose 'text' value differs from `after` at the same index label.

    Args:
        before_df: DataFrame with a 'text' column (the state before a transform).
        after: object indexable by the labels of `before_df` (e.g. the Series
            returned by `before_df.text.apply(...)`).

    Returns:
        The number of rows for which `before_df['text']` and `after` disagree.
    """
    changed_labels = [
        label
        for label, record in before_df.iterrows()
        if record['text'] != after[label]
    ]
    return len(changed_labels)

In [232]:
# temp = train_df.processed_text.apply(handle_emojis)
# count = check_diff(train_df, temp)
# print('Number of tweets contains emoji: ', count)

In [233]:
train_df.text = train_df.text.apply(handle_emojis)

### Task 2: split all URLs out of the main texts

In [234]:
def handle_url(tweet):
    """Delete every run of non-whitespace characters that starts with 'http'."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", tweet)

In [235]:
temp = train_df.text.apply(handle_url)
count = check_diff(train_df, temp)
print('Number of tweets contains URL: ', count)

Number of tweets contains URL:  549


In [236]:
train_df.text = train_df.text.apply(handle_url)

### Task 3: remove all non-ASCII characters

In [237]:
def handle_non_ASCII(tweet):
    """Replace every character whose code point is 128 or above with a space."""
    cleaned = []
    for ch in tweet:
        # Characters below code point 128 are kept verbatim.
        cleaned.append(' ' if ord(ch) >= 128 else ch)
    return ''.join(cleaned)

In [238]:
temp = train_df.text.apply(handle_non_ASCII)
count = check_diff(train_df, temp)
print('Number of tweets contains non-ASCII: ', count)

Number of tweets contains non-ASCII:  176


In [239]:
train_df.text = train_df.text.apply(handle_non_ASCII)

### Task 4: remove all numbers

In [240]:
def handle_numbers(tweet):
    """Remove standalone numbers (whitespace-delimited digit runs) from a tweet.

    A whole run of consecutive numbers is matched at once. Matching them one
    at a time fails because neighbouring numbers share the whitespace between
    them: once it is consumed by the first match, the next number is no
    longer preceded by whitespace and survives ("a 1 2 b" -> "a 2 b").

    Digits embedded in a word ("route66", "4ever") are left untouched.

    Args:
        tweet: the tweet text.

    Returns:
        The tweet with each run of standalone numbers, together with its
        surrounding whitespace, replaced by a single space.
    """
    # normal numbers: (start | space) number [more numbers] (space | end)
    tweet = re.sub(r"(?:^|\s)\d+(?:\s+\d+)*(?:\s|$)", " ", tweet)
    # TODO: money amounts (digits attached to a symbol, e.g. "$5") are not handled.
    return tweet

In [241]:
train_df.text = train_df.text.apply(handle_numbers)

### Task 5: remove stopwords

In [242]:
stopwords_df = pd.read_csv(Path('dataset/stopwords.csv'), header=None, names=['text'])

In [243]:
# Plain Python list of the entries in the single 'text' column of the stopword file.
stopwords = stopwords_df['text'].tolist()

In [244]:
def remove_stopwords(tweet):
    """Tokenize a tweet with NLTK's TweetTokenizer and drop tokens found in `stopwords`.

    The surviving tokens are re-joined with single spaces. Membership is
    tested against the notebook-level `stopwords` list exactly as written
    (no case folding).
    """
    tokens = TweetTokenizer().tokenize(tweet)
    kept = filter(lambda token: token not in stopwords, tokens)
    return ' '.join(kept)

In [245]:
temp = train_df.text.apply(remove_stopwords)
count = check_diff(train_df, temp)
print('Number of tweets contains stopwords: ', count)

Number of tweets contains stopwords:  980


In [246]:
train_df.text = train_df.text.apply(remove_stopwords)

### Task 6: split out all hashtags and store all of the hashtags in txt file

In [247]:
def extract_hashtags(tweet):
    """Return the tweet unchanged together with the list of its hashtags.

    A hashtag is the run of non-whitespace characters following a '#'; the
    '#' itself is not included in the returned tags.
    """
    hashtag_pattern = re.compile(r'#(\S+)')
    return tweet, hashtag_pattern.findall(tweet)

In [248]:
def handle_hashtags(df):
    """Collect the hashtags of every row's 'text' and write the processed text back.

    The frame is updated in place (row by row through `df.loc`) and also
    returned, together with the flat list of all hashtags in row order.
    """
    collected = []
    for label, record in df.iterrows():
        processed_text, tags = extract_hashtags(record['text'])
        df.loc[label, 'text'] = processed_text
        collected += tags
    return df, collected

In [249]:
train_df, hashtags = handle_hashtags(train_df)

In [250]:
# Write one unique hashtag per line. The set is sorted so the file content is
# identical across runs (set iteration order is not stable between sessions).
with open("hashtags.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(sorted(set(hashtags))))

### Task 7: print out the 10 most popular hashtags with counts

In [251]:
hashtags = [str.lower(h) for h in hashtags]
c = collections.Counter(hashtags)
c.most_common(10)

[('driverless', 30),
 ('google', 23),
 ('cars', 10),
 ('cas13', 8),
 ('codecon', 6),
 ('tech', 5),
 ('snbto', 4),
 ('toronto', 4),
 ('technology', 4),
 ('cbcmtl', 4)]

### Task 8: remove all characters outside the alphabet system, except the whitespaces

In [252]:
def remove_non_alphabet(tweet):
    """Delete every character that is neither an ASCII letter nor whitespace.

    Args:
        tweet: the tweet text.

    Returns:
        The tweet containing only a-z, A-Z and whitespace characters.
    """
    # Raw string: in a normal string literal '\s' is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    return re.sub(r'[^a-zA-Z\s]', '', tweet)

In [253]:
train_df.text = train_df.text.apply(remove_non_alphabet)

### Task 9: combine all

In [262]:
def preprocess(train_path):
    """Run the full text-cleaning pipeline on a CSV file and save the result.

    The steps are applied to the 'text' column in this order: emoticons,
    URLs, non-ASCII characters, numbers, stopwords, hashtags, and finally
    every remaining character that is not a letter or whitespace.

    Side effect: the processed frame is written (without the index) next to
    the input as '<train_path>.preprocess'.

    Args:
        train_path: path of the input CSV as a string or a `pathlib.Path`;
            the file must contain a 'text' column.

    Returns:
        The processed DataFrame.
    """
    train_df = pd.read_csv(Path(train_path))

    # Element-wise cleaning steps that run before hashtag handling.
    text_steps = (
        handle_emojis,
        handle_url,
        handle_non_ASCII,
        handle_numbers,
        remove_stopwords,
    )
    for step in text_steps:
        train_df['text'] = train_df['text'].apply(step)

    train_df, _ = handle_hashtags(train_df)
    train_df['text'] = train_df['text'].apply(remove_non_alphabet)

    # str() so that a Path argument works too (Path + str raises TypeError).
    train_df.to_csv(Path(str(train_path) + '.preprocess'), index=False)
    return train_df

In [263]:
preprocess_df = preprocess('dataset/train.csv')

### Task 10

In [259]:
import nltk
from nltk.stem.snowball import SnowballStemmer
import re
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
stemmer = SnowballStemmer("english")

def tokenize_and_strip(text):
    """Tokenize text with NLTK and keep only the tokens that pass the filters below.

    Tokens are lowercased and stripped of leading/trailing punctuation
    (including the '…' character). A token is then discarded when:
      * it is exactly one character long,
      * its Snowball stem is in `english_vocab` and it does not contain 'sex',
      * it starts with three or more digits followed by 'k',
      * it contains no ASCII letter.

    Args:
        text: the document to tokenize.

    Returns:
        List of the surviving tokens, in order of appearance.
    """
    # Imported here because the notebook only imports `string` in a later
    # cell; without this the function depends on cell execution order.
    import string

    # Loop-invariant: characters stripped from both ends of every token.
    strip_chars = string.punctuation + '…'

    filtered_tokens = []
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            token = word.lower().strip(strip_chars)
            if len(token) == 1:
                continue
            # NOTE(review): this DROPS tokens whose stem is a known English
            # word, so ordinary vocabulary never reaches the vectorizer. That
            # looks inverted for sentiment analysis -- confirm the intent.
            if stemmer.stem(token) in english_vocab and 'sex' not in token:
                continue

            # money/time
            if re.match(r'(\d{3,}k)', token) is not None:
                continue
            # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
    return filtered_tokens

In [277]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import string

In [279]:
# cov_types may contain any of: 'spherical', 'diag', 'tied', 'full'.
# Original author's note: spherical gives bad results because it works like k-means.
def _print_split_metrics(split_name, y_true, y_pred, n_classes, leading=''):
    """Print overall accuracy plus per-class accuracy, precision and recall.

    Args:
        split_name: label used in the printed headers ('Train' or 'Test').
        y_true: 1-D NumPy array of encoded true labels.
        y_pred: 1-D NumPy array of predicted component indices.
        n_classes: number of classes/components to report on.
        leading: text printed in front of the accuracy header (e.g. '\\n').
    """
    nan = float('nan')

    overall = np.mean(y_pred.ravel() == y_true.ravel()) * 100
    print('%s%s accuracy: %.1f' % (leading, split_name, overall))
    for i in range(n_classes):
        mask = y_true == i
        # A class without samples has no defined accuracy; report nan explicitly.
        class_accuracy = np.mean(y_pred[mask] == i) * 100 if mask.any() else nan
        print('\tclass-%s: %.1f' % (i, class_accuracy))

    # Samples of class i that were also predicted as i.
    true_positives = [np.count_nonzero((y_true == i) & (y_pred == i))
                      for i in range(n_classes)]

    print('%s Precision:' % split_name)
    for i in range(n_classes):
        predicted = np.count_nonzero(y_pred == i)
        # No prediction for this class: precision is undefined (nan), and the
        # explicit guard avoids a divide-by-zero warning.
        precision = true_positives[i] * 100. / predicted if predicted else nan
        print('\tclass-%s: %.1f' % (i, precision))

    print('%s Recall:' % split_name)
    for i in range(n_classes):
        actual = np.count_nonzero(y_true == i)
        recall = true_positives[i] * 100. / actual if actual else nan
        print('\tclass-%s: %.1f' % (i, recall))


def gmm_clustering(n_components, X, y, stopwords, use_idf=True, cov_types=['diag', 'tied', 'full']):
    """Fit one Gaussian mixture per covariance type on TF-IDF features and print metrics.

    Labels are integer-encoded, the data is split 67/33 (random_state=42),
    a TF-IDF vectorizer is fitted on the training texts, and each mixture is
    initialised with the per-class mean of the training features before EM.
    Mixture component i is then compared against encoded label i.

    Args:
        n_components: number of mixture components; also used as the number
            of classes that are reported.
        X: sequence of documents (strings).
        y: sequence of labels, one per document.
        stopwords: stop-word list forwarded to `TfidfVectorizer`.
        use_idf: forwarded to `TfidfVectorizer`.
        cov_types: covariance types to try (never modified here).

    Returns:
        (estimators, tfidf_vectorizer): dict mapping covariance type to the
        fitted `GaussianMixture`, and the fitted `TfidfVectorizer`.
    """
    print('==========encode labels==========')
    le = preprocessing.LabelEncoder()
    encoded_labels = le.fit_transform(y)

    actual_labels = le.inverse_transform(list(range(n_components)))
    for i in range(n_components):
        print('encoded label: %s, actual label: %s' % (i, actual_labels[i]))

    X_train, X_test, y_train, y_test = train_test_split(
        X, encoded_labels, test_size=0.33, random_state=42)

    print('==========calcuate tfidf matrix===========')
    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_features=200000,
                                       min_df=0.1, stop_words=stopwords,
                                       use_idf=use_idf, tokenizer=tokenize_and_strip, ngram_range=(1, 3))
    # Fit the vocabulary/IDF on the training texts only, then reuse it for the test texts.
    X_train = tfidf_vectorizer.fit_transform(X_train).toarray()
    print('X_train.shape: ', X_train.shape)

    X_test = tfidf_vectorizer.transform(X_test).toarray()
    print('X_test.shape: ', X_test.shape)

    n_classes = n_components

    # Since we have class labels for the training data, we can initialize the
    # GMM means in a supervised manner. The class means do not depend on the
    # covariance type, so they are computed once.
    class_means = np.array([X_train[y_train == i].mean(axis=0)
                            for i in range(n_classes)])

    # Try GMMs using different types of covariances.
    estimators = {cov_type: GaussianMixture(n_components=n_classes,
                                            covariance_type=cov_type,
                                            max_iter=100, random_state=0)
                  for cov_type in cov_types}

    print('==========Cluster==========')
    for name, estimator in estimators.items():
        print('\n--------Cov type: %s----------' % str.upper(name))
        # Each estimator gets its own copy of the initial means.
        estimator.means_init = class_means.copy()

        # Train the other parameters using the EM algorithm.
        estimator.fit(X_train)

        y_train_pred = estimator.predict(X_train)
        _print_split_metrics('Train', y_train, y_train_pred, n_classes)
        print('Confusion_matrix: \n', confusion_matrix(y_train, y_train_pred))

        y_test_pred = estimator.predict(X_test)
        _print_split_metrics('Test', y_test, y_test_pred, n_classes, leading='\n')

    return estimators, tfidf_vectorizer

In [274]:
# Documents (cleaned tweet texts) and their sentiment labels for clustering.
contents = preprocess_df['text'].tolist()
true_labels = preprocess_df['sentiment'].tolist()
# NOTE(review): the recorded output of `set(true_labels)` further down shows the
# string labels 'negative'/'neutral'/'positive', while `train_df.head()` shows a
# numeric sentiment column. Presumably the mapping below was executed once and
# commented out afterwards -- confirm, because on a fresh kernel the labels may
# stay numeric and no longer match n_components=3.
# for i in range(len(true_labels)):
#     if true_labels[i] in [4, 5]:
#         true_labels[i] = 'positive'
#     elif true_labels[i] in [3]:
#         true_labels[i] = 'neutral'
#     else:
#         true_labels[i] = 'negative'

In [270]:
len(contents)

981

In [275]:
set(true_labels)

{'negative', 'neutral', 'positive'}

In [280]:
estimators, tfidf_vectorizer = gmm_clustering(n_components=3, X=contents, y=true_labels, 
                                              stopwords=stopwords, cov_types=['spherical', 'diag', 'tied', 'full'])

encoded label: 0, actual label: negative
encoded label: 1, actual label: neutral
encoded label: 2, actual label: positive


  if diff:


X_train.shape:  (657, 1)
X_test.shape:  (324, 1)

--------Cov type: DIAG----------
Train accuracy: 19.9
	class-0: 85.3
	class-1: 0.0
	class-2: 29.9
Train Precision:
	class-0: 15.7
	class-1: nan
	class-2: 35.7
Train Recall:
	class-0: 85.3
	class-1: 0.0
	class-2: 29.9
Confusion_matrix: 
 [[ 81   0  14]
 [319   0  76]
 [117   0  50]]

Test accuracy: 16.7
	class-0: 86.7
	class-1: 0.0
	class-2: 21.1
Test Precision:
	class-0: 15.5
	class-1: nan
	class-2: 20.8
Test Recall:
	class-0: 86.7
	class-1: 0.0
	class-2: 21.1

--------Cov type: SPHERICAL----------
Train accuracy: 19.9
	class-0: 85.3
	class-1: 0.0
	class-2: 29.9
Train Precision:
	class-0: 15.7
	class-1: nan
	class-2: 35.7
Train Recall:
	class-0: 85.3
	class-1: 0.0
	class-2: 29.9
Confusion_matrix: 
 [[ 81   0  14]
 [319   0  76]
 [117   0  50]]

Test accuracy: 16.7
	class-0: 86.7
	class-1: 0.0
	class-2: 21.1
Test Precision:
	class-0: 15.5
	class-1: nan
	class-2: 20.8
Test Recall:
	class-0: 86.7
	class-1: 0.0
	class-2: 21.1

--------Cov t



As we can see, the GMM model does not work well on this dataset.

In [282]:
def predict(test_path, estimator, tfidf_vectorizer):
    """Preprocess a CSV of tweets and predict a mixture component for each one.

    Args:
        test_path: path of the CSV to score; it is run through `preprocess`,
            which also writes '<test_path>.preprocess' as a side effect.
        estimator: a fitted estimator exposing `predict` (e.g. one of the
            `GaussianMixture` objects returned by `gmm_clustering`).
        tfidf_vectorizer: the ALREADY FITTED vectorizer returned by
            `gmm_clustering`.

    Returns:
        List with one predicted component index per row of the input file.
        NOTE(review): a component index is only presumed to match the encoded
        label of the same number (via the supervised means initialisation) --
        confirm before mapping indices back to label names.
    """
    preprocess_df = preprocess(test_path)
    contents = preprocess_df['text'].tolist()
    # transform(), not fit_transform(): the vocabulary and IDF weights must be
    # the ones the estimator was trained with.
    features = tfidf_vectorizer.transform(contents).toarray()
    return estimator.predict(features).tolist()

In [284]:
# predict('dataset/test_path.csv', estimators['diag'], tfidf_vectorizer)