<a href="https://colab.research.google.com/github/jrakhshanda/Text-Mining/blob/main/Text_mining_report/word_counts_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This is a simple solution using only word counts with CountVectorizer to make predictions.

#### Here's the idea:
- Find and weight words that are used most often in only certain kinds of tweets.
- Search all subsets of the tweet and calculate a score based on these weights.
- For positive or negative tweets, the selected text is the most highly weighted subset, within some threshold.
- Always return the entire text for neutral tweets.

In [12]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.feature_extraction.text import CountVectorizer


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
import os
from sklearn import model_selection, metrics
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import nltk
nltk.download('stopwords')

from sklearn import model_selection

import warnings
warnings.filterwarnings("ignore")

import os


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Import datasets
df = pd.read_csv('/content/drive/MyDrive/train.csv', keep_default_na=False)

In [7]:
train[train['text'].isna()]

Unnamed: 0,textID,text,selected_text,sentiment


Break up the training data into datasets where the sentiment is positive, neutral, or negative

In [9]:
# Make all the text lowercase - casing doesn't matter when 
# we choose our selected text.
train['text'] = train['text'].apply(lambda x: x.lower())

# Make training/test split
from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(
    train, train_size = 0.80, random_state = 0)

In [10]:
pos_train = X_train[X_train['sentiment'] == 'positive']
neutral_train = X_train[X_train['sentiment'] == 'neutral']
neg_train = X_train[X_train['sentiment'] == 'negative']

### Algorithm for weight calculation:

1. For each class $j \in \{positive, neutral, negative\}$

    a. Find all the words $i$ in the tweets belonging to class $j$.

    b. Calculate $n_{i, j} =$ the number of tweets in class $j$ containing word $i$. 

    c. Let $d_j$ be the number of tweets in class $j$.  Calculate $p_{i, j} = \frac{n_{i, j}}{d_j}$, the proportion of tweets in class $j$ that conain word $i$.

    d. Let $w_{i, j} = p_{i, j} - \sum\limits_{k \neq j}p_{i, k}$ be the weights assigned to each word within each class. 
    

In [13]:
# Use CountVectorizer to get the word counts within each dataset

cv = CountVectorizer(max_df=0.95, min_df=2,
                                     max_features=10000,
                                     stop_words='english')

X_train_cv = cv.fit_transform(X_train['text'])

X_pos = cv.transform(pos_train['text'])
X_neutral = cv.transform(neutral_train['text'])
X_neg = cv.transform(neg_train['text'])

pos_count_df = pd.DataFrame(X_pos.toarray(), columns=cv.get_feature_names())
neutral_count_df = pd.DataFrame(X_neutral.toarray(), columns=cv.get_feature_names())
neg_count_df = pd.DataFrame(X_neg.toarray(), columns=cv.get_feature_names())

# Create dictionaries of the words within each sentiment group, where the values are the proportions of tweets that 
# contain those words

pos_words = {}
neutral_words = {}
neg_words = {}

for k in cv.get_feature_names():
    pos = pos_count_df[k].sum()
    neutral = neutral_count_df[k].sum()
    neg = neg_count_df[k].sum()
    
    pos_words[k] = pos/pos_train.shape[0]
    neutral_words[k] = neutral/neutral_train.shape[0]
    neg_words[k] = neg/neg_train.shape[0]
    
# We need to account for the fact that there will be a lot of words used in tweets of every sentiment.  
# Therefore, we reassign the values in the dictionary by subtracting the proportion of tweets in the other 
# sentiments that use that word.

neg_words_adj = {}
pos_words_adj = {}
neutral_words_adj = {}

for key, value in neg_words.items():
    neg_words_adj[key] = neg_words[key] - (neutral_words[key] + pos_words[key])
    
for key, value in pos_words.items():
    pos_words_adj[key] = pos_words[key] - (neutral_words[key] + neg_words[key])
    
for key, value in neutral_words.items():
    neutral_words_adj[key] = neutral_words[key] - (neg_words[key] + pos_words[key])

### Algorithm for finding selected text: 
  
1. For every tweet:

    a. Let $j$ be the sentiment of the tweet. 

    b. If $j ==$ neutral return entire text.

    c. Otherwise, for each subset of words in the tweet, calculate $\sum\limits_{i}w_{i, j}$, where $i$ is the set of words in the tweet

   d. Return the subset of words with the largest sum, given that it exceeds some tolerance.

In [14]:
def calculate_selected_text(df_row, tol = 0):
    
    tweet = df_row['text']
    sentiment = df_row['sentiment']
    
    if(sentiment == 'neutral'):
        return tweet
    
    elif(sentiment == 'positive'):
        dict_to_use = pos_words_adj # Calculate word weights using the pos_words dictionary
    elif(sentiment == 'negative'):
        dict_to_use = neg_words_adj # Calculate word weights using the neg_words dictionary
        
    words = tweet.split()
    words_len = len(words)
    subsets = [words[i:j+1] for i in range(words_len) for j in range(i,words_len)]
    
    score = 0
    selection_str = '' # This will be our choice
    lst = sorted(subsets, key = len) # Sort candidates by length
    
    
    for i in range(len(subsets)):
        
        new_sum = 0 # Sum for the current substring
        
        # Calculate the sum of weights for each word in the substring
        for p in range(len(lst[i])):
            if(lst[i][p].translate(str.maketrans('','',string.punctuation)) in dict_to_use.keys()):
                new_sum += dict_to_use[lst[i][p].translate(str.maketrans('','',string.punctuation))]
            
        # If the sum is greater than the score, update our current selection
        if(new_sum > score + tol):
            score = new_sum
            selection_str = lst[i]
            #tol = tol*5 # Increase the tolerance a bit each time we choose a selection

    # If we didn't find good substrings, return the whole text
    if(len(selection_str) == 0):
        selection_str = words
        
    return ' '.join(selection_str)

Calculate the selected text and score for the validation set.

In [15]:
pd.options.mode.chained_assignment = None

In [16]:
tol = 0.001

X_val['predicted_selection'] = ''

for index, row in X_val.iterrows():
    
    selected_text = calculate_selected_text(row, tol)
    
    X_val.loc[X_val['textID'] == row['textID'], ['predicted_selection']] = selected_text

In [28]:
X_val.head()

Unnamed: 0,textID,text,selected_text,sentiment,predicted_selection,jaccard
8243,0565c06b55,glad you are having a blast.,Glad,positive,glad,1.0
12902,93d46ca978,when you haven`t had one in over a week! that...,mean,negative,when you haven`t had one in over a week! that ...,0.052632
6708,1f30caf2c0,thanks glad you like it,thanks,positive,thanks glad,0.5
26898,2f72905dbd,its 1:11am and both my girls are still up! .. ...,im so tired,negative,tired,0.333333
20706,bd0450f89b,my skin is burning up so much,burning,negative,my skin is burning up so much,0.142857


In [17]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [25]:
X_val['jaccard'] = X_val.apply(lambda x: jaccard(x['selected_text'], x['predicted_selection']), axis = 1)

print('The jaccard score for the validation set is:', np.mean(X_val['jaccard']))
jaccs = np.mean(X_val['jaccard'])

The jaccard score for the validation set is: 0.651373471729071


In [26]:
def evaluation_report(gold, pred):
    """Print precision, recall, and F1 score.
    
    Args:
        gold: The set with the gold-standard values.
        pred: The set with the predicted values.
    
    Returns:
        Nothing, but prints the precision, recall, and F1 values computed
        based on the specified sets.
    """
    gold = set(gold)
    pred = set(pred)
    # TODO: Replace the next line with your own code
    precision = len(gold.intersection(pred))/len(pred)
    recall = len(gold.intersection(pred))/len(gold)
    F1_score = 2 * (precision*recall) / (precision + recall)
    df = pd.DataFrame(data=[{'precision': precision, 'recall':recall, 'F1_score':F1_score, 'jaccard_similarity': jaccs}]) 

    return df

In [29]:
evaluation_report(str(X_val['selected_text']),str(X_val['predicted_selection']))

Unnamed: 0,precision,recall,F1_score,jaccard_similarity
0,0.930233,0.888889,0.909091,0.651373
