# Import Statements

In [49]:
import pandas as pd # Imported to enable the use of datastructures like dataframe
import matplotlib.pyplot as plt # Imported to visusalise data
import seaborn as sns # Imported to visualise data
import numpy as np # Imported for calculations
import json
from nltk.tokenize import sent_tokenize, word_tokenize
import emoji
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer # Imported to convert raw documents into a matrix of tf idf features
from sklearn.linear_model import LogisticRegression # Imported to enable the use of logistic regression to classify text
from sklearn.model_selection import train_test_split # Imported to enable the user to split the data into train, test samples
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score # for report generation

In [5]:
local=False
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Able to open google drive!")
except:
    print("Not able to open google drive, opening files locally")
    local=True
else:
  print("Nothing went wrong") 

Not able to open google drive, opening files locally


In [6]:
if(not local):
    path_train = '/content/drive/MyDrive/Datasets/malayalam_train.tsv'
    path_val = '/content/drive/MyDrive/Datasets/malayalam_dev.tsv'
    path_test = '/content/drive/MyDrive/Datasets/malayalam_test.tsv'
else:
    path_train='/Users/sachin/Library/CloudStorage/GoogleDrive-heysachins@gmail.com/My Drive/Datasets/malayalam_train.tsv'
    path_test='/Users/sachin/Library/CloudStorage/GoogleDrive-heysachins@gmail.com/My Drive/Datasets/malayalam_test.tsv'
    path_val='/Users/sachin/Library/CloudStorage/GoogleDrive-heysachins@gmail.com/My Drive/Datasets/malayalam_dev.tsv'

In [7]:
print(path_test)

/Users/sachin/Library/CloudStorage/GoogleDrive-heysachins@gmail.com/My Drive/Datasets/malayalam_test.tsv


# Importing the dataset

In [8]:
df_train = pd.read_csv(path_train, sep='\t')
print(df_train)

df_val = pd.read_csv(path_val, sep='\t')
print(df_val)

df_test = pd.read_csv(path_test, sep='\t')
print(df_test)

                                                   text         category
0                hoo mammokka police vesham aaha anthas        Positive 
1        Oru rekshayum illa...kidilam kannu nananjupoyi        Positive 
2                             Ikka     waiting.........        Positive 
3                Raju Ettante Oro Shorttum Ijathi ppwli        Positive 
4      Ettan fansil netti poya aarenkilum undo?    #...        Positive 
...                                                 ...              ...
4846   Madhuraraja trailer Kand ivide vannanvar likkeee   unknown_state 
4847   Njn pru lalettan fan ahn..  eee trailer mass ...        Positive 
4848   Valiya pratheesha illa nalla entertainment  a...  Mixed_feelings 
4849   Dislike adikkunna kazhuthakalude mukhath adik...        Negative 
4850   Adipoli..... Pakshe oru sankadam ithinte thir...  Mixed_feelings 

[4851 rows x 2 columns]
                                                  text        category
0                           

# Exploring the dataset

In [26]:
print(df_train.shape)
print(df_test.shape)
print(df_val.shape)

total=df_val.shape[0]+df_test.shape[0]+df_train.shape[0]
print("Total = ",total)

(4851, 2)
(1348, 2)
(540, 2)
Total =  6739


In [29]:
try:
    df_test = df_test.drop('id', axis=1)
except:
    print("already dropped column")

df_dataset = pd.concat([df_train, df_val, df_test], ignore_index=True)
print(df_dataset.shape)

already dropped column
(6739, 2)


In [30]:
# Renaming (df_train) the classes for convenience

# Removing any leading/trailing spaces
df_dataset['category'] = df_dataset['category'].str.strip()

# Replacing 'unknown_state' with 'Irrelevant'
df_dataset['category'] = df_dataset['category'].replace({'unknown_state': 'Neutral'})
df_dataset['category'] = df_dataset['category'].replace({'not-malayalam': 'Not-Malayalam'})

# Viewing the number of items in each class after replacement
print(df_dataset['category'].value_counts())  # Used to view the number of items in each class.

## There is a significant imbalance in the classes in this dataset

Positive          2811
Neutral           1903
Not-Malayalam      884
Negative           738
Mixed_feelings     403
Name: category, dtype: int64


In [31]:
# Renaming (df_train) the classes for convenience

# Removing any leading/trailing spaces
df_train['category'] = df_train['category'].str.strip()

# Replacing 'unknown_state' with 'Irrelevant'
df_train['category'] = df_train['category'].replace({'unknown_state': 'Neutral'})
df_train['category'] = df_train['category'].replace({'not-malayalam': 'Not-Malayalam'})

# Viewing the number of items in each class after replacement
print(df_train['category'].value_counts())  # Used to view the number of items in each class.

## There is a significant imbalance in the classes in this dataset

Positive          2022
Neutral           1344
Not-Malayalam      647
Negative           549
Mixed_feelings     289
Name: category, dtype: int64


In [32]:
# Renaming (df_test) the classes for convenience

# Removing any leading/trailing spaces
df_test['category'] = df_test['category'].str.strip()

# Replacing 'unknown_state' with 'Irrelevant'
df_test['category'] = df_test['category'].replace({'unknown_state': 'Neutral'})
df_test['category'] = df_test['category'].replace({'not-malayalam': 'Not-Malayalam'})

# Viewing the number of items in each class after replacement
print(df_test['category'].value_counts())  # Used to view the number of items in each class.

## There is a significant imbalance in the classes in this dataset

Positive          565
Neutral           398
Not-Malayalam     177
Negative          138
Mixed_feelings     70
Name: category, dtype: int64


In [33]:
# Renaming (df_val) the classes for convenience

# Removing any leading/trailing spaces
df_val['category'] = df_val['category'].str.strip()

# Replacing 'unknown_state' with 'Irrelevant'
df_val['category'] = df_val['category'].replace({'unknown_state': 'Neutral'})
df_val['category'] = df_val['category'].replace({'not-malayalam': 'Not-Malayalam'})

# Viewing the number of items in each class after replacement
print(df_val['category'].value_counts())  # Used to view the number of items in each class.

## There is a significant imbalance in the classes in this dataset

Positive          224
Neutral           161
Not-Malayalam      60
Negative           51
Mixed_feelings     44
Name: category, dtype: int64


# Creating dataframes for each dataset (WholeDataset, Test, Train, Validation)

In [35]:
# Creating dataframes for (df_train) all categories for later use

df_dataset_positive_words = df_dataset[df_dataset['category']=='Positive']
df_dataset_negative_words = df_dataset[df_dataset['category']=='Negative']
df_dataset_mixed_feeling_words = df_dataset[df_dataset['category']=='Mixed_feelings']
df_dataset_neutral_words = df_dataset[df_dataset['category']=='Neutral']
df_dataset_not_malayalam_words = df_dataset[df_dataset['category']=='Not-Malayalam']

print(df_dataset_positive_words.shape)
print(df_dataset_negative_words.shape)
print(df_dataset_mixed_feeling_words.shape)
print(df_dataset_neutral_words.shape)
print(df_dataset_not_malayalam_words.shape)

(2811, 2)
(738, 2)
(403, 2)
(1903, 2)
(884, 2)


In [36]:
# Creating dataframes for (df_train) all categories for later use

df_train_positive_words = df_train[df_train['category']=='Positive']
df_train_negative_words = df_train[df_train['category']=='Negative']
df_train_mixed_feeling_words = df_train[df_train['category']=='Mixed_feelings']
df_train_neutral_words = df_train[df_train['category']=='Neutral']
df_train_not_malayalam_words = df_train[df_train['category']=='Not-Malayalam']

print(df_train_positive_words.shape)
print(df_train_negative_words.shape)
print(df_train_mixed_feeling_words.shape)
print(df_train_neutral_words.shape)
print(df_train_not_malayalam_words.shape)

(2022, 2)
(549, 2)
(289, 2)
(1344, 2)
(647, 2)


In [37]:
# Creating dataframes for (df_test) all categories for later use

df_test_positive_words = df_test[df_test['category']=='Positive']
df_test_negative_words = df_test[df_test['category']=='Negative']
df_test_mixed_feeling_words = df_test[df_test['category']=='Mixed_feelings']
df_test_neutral_words = df_test[df_test['category']=='Neutral']
df_test_not_malayalam_words = df_test[df_test['category']=='Not-Malayalam']

print(df_test_positive_words.shape)
print(df_test_negative_words.shape)
print(df_test_mixed_feeling_words.shape)
print(df_test_neutral_words.shape)
print(df_test_not_malayalam_words.shape)

(565, 2)
(138, 2)
(70, 2)
(398, 2)
(177, 2)


In [39]:
# Creating dataframes for (df_val) all categories for later use

df_val_positive_words = df_val[df_val['category']=='Positive']
df_val_negative_words = df_val[df_val['category']=='Negative']
df_val_mixed_feeling_words = df_val[df_val['category']=='Mixed_feelings']
df_val_neutral_words = df_val[df_val['category']=='Neutral']
df_val_not_malayalam_words = df_val[df_val['category']=='Not-Malayalam']

print(df_val_positive_words.shape)
print(df_val_negative_words.shape)
print(df_val_mixed_feeling_words.shape)
print(df_val_neutral_words.shape)
print(df_val_not_malayalam_words.shape)

(224, 2)
(51, 2)
(44, 2)
(161, 2)
(60, 2)


# Descrpancies Found

In [None]:

# Database is imbalanced, nothing has been done to rectify this.

value_counts = df_dataset['category'].value_counts()
print(value_counts)

In [48]:
# Pre processing is not done as mentioned in the paper.

# We preprocessed the comments by removing the emoji’s,
# and sentence length longer than 15 or less than 5 words since
# sentence more than 15 words will be difficult for annotators.
# After cleaning, we got 6,738 sentences for Malayalam-English
# code-mixed post comments.

# Function to count words in a sentence
def count_words(sentence):
    return len(sentence.split())

# Apply the function and filter the DataFrame
df_wordsGreater15 = df_dataset[df_dataset['text'].apply(lambda x: count_words(x) > 15)]
df_wordsLess5 = df_dataset[df_dataset['text'].apply(lambda x: count_words(x) < 5)]


# Function to check if a sentence contains any emoji
def contains_emoji(sentence):
    for character in sentence:
        if emoji.is_emoji(character):
            return True
    return False

# Apply the function and count sentences with emojis
df_emojis = df_dataset[df_dataset['text'].apply(contains_emoji)]

print(str(df_wordsGreater15.shape[0]) + " comments have more than 15 words") #236 sentences have more than 15 words
print(str(df_wordsLess5.shape[0]) + " comments have less than 5 words") #346 sentences have less than 5 words
print(str(df_emojis.shape[0]) + " sentences have emojis in them") #309 sentences have emojis in them
print(str(df_wordsGreater15.shape[0]+df_wordsLess5.shape[0]+df_emojis.shape[0]) + " sentences were not preprocessed")

236 comments have more than 15 words
346 comments have less than 5 words
309 sentences have emojis in them
891 sentences were not preprocessed


# Computing the statistics using Tokenize from NLTK

In [68]:
all_words = set()
def analyze_text(text):
    # Tokenizes the text into sentences
    sentences = sent_tokenize(text)
    sentence_count = len(sentences)

    # Tokenizes the text into words
    words = word_tokenize(text)
    word_count = len(words)

    # Computes the vocabulary size
    # vocabulary_size = len(set(words))
    for i in words:
        all_words.add(i)

    # Calculates average sentence length in terms of words
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0

    return sentence_count, word_count, avg_sentence_length

# Apply the analysis function to each text entry in the DataFrame
analysis_results = df_dataset['text'].apply(lambda x: analyze_text(x))

# Unpack the results into separate columns
df_dataset['sentence_count'], df_dataset['word_count'],  df_dataset['avg_sentence_length'] = zip(*analysis_results)

# To find the total number of tokens, you can sum up the word counts
total_tokens = df_dataset['word_count'].sum()

# total sentence count
total_sentence_count = df_dataset['sentence_count'].sum()

#total word_count
total_word_count = df_dataset['word_count'].sum()

#total vocabulary
# total_vocabulary_count = df_dataset['vocabulary_size'].sum()

#avg sentence_length
avg_sentence_length = df_dataset['avg_sentence_length'].mean()

# avg sentence per post
avg_sentence_per_comment = df_dataset['sentence_count'].mean()

# Display the overall analysis results and the updated DataFrame
print(f"Total number of tokens: {total_tokens}")
print(f"Total number of sentences: {total_sentence_count}")
print(f"Total number of words: {total_word_count}")
print(f"Vocabulary Size: {len(all_words)}")
print(f"Average sentence length: {avg_sentence_length}")
print(f"Average sentence per comment: {avg_sentence_per_comment}")

# print(df_dataset.head())  # Adjust according to how you wish to view the results


Total number of tokens: 61022
Total number of sentences: 7787
Total number of words: 61022
Vocabulary Size: 19389
Average sentence length: 8.265447395756048
Average sentence per comment: 1.1555126873423356


In [69]:
df_dataset.value_counts('category')

category
Positive          2811
Neutral           1903
Not-Malayalam      884
Negative           738
Mixed_feelings     403
dtype: int64

# Computing the statistics using SPACY

In [58]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting spacy
  Obtaining dependency information for spacy from https://files.pythonhosted.org/packages/ca/f3/609bb7512cad1f02af13daa23aa433b931da34c502211f29fd47dceff624/spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Obtaining dependency information for murmurhash<1.1.0,>=0.28.0 from https://files.pythonhosted.org/packages/7a/05/4a3b5c3043c6d84c00bf0f574d326660702b1c10174fe6b44cef3c3dff08/murmurh

In [60]:
import pandas as pd
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a function to analyze the text
def analyze_text_spacy(text):
    doc = nlp(text)
    
    # Sentence count
    sentence_count = len(list(doc.sents))
    
    # Word count (excluding punctuations)
    word_count = len([token for token in doc if not token.is_punct])
    
    # Vocabulary size (unique words, excluding punctuations)
    vocabulary_size = len(set([token.text.lower() for token in doc if not token.is_punct]))
    
    # Average sentence length
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    
    return sentence_count, word_count, vocabulary_size, avg_sentence_length

# Apply the analysis function to each row in the 'text' column
df_dataset[['sentence_count', 'word_count', 'vocabulary_size', 'avg_sentence_length']] = df_dataset['text'].apply(
    lambda x: pd.Series(analyze_text_spacy(x))
)

# To find the total number of tokens, you can sum up the word counts
total_tokens = df_dataset['word_count'].sum()

# total sentence count
total_sentence_count = df_dataset['sentence_count'].sum()

#total word_count
total_word_count = df_dataset['word_count'].sum()

#total vocabulary
total_vocabulary_count = df_dataset['vocabulary_size'].sum()

#avg sentence_length
avg_sentence_length = df_dataset['avg_sentence_length'].mean()

# Display the overall analysis results and the updated DataFrame
print(f"Total number of tokens: {total_tokens}")
print(f"Total number of sentences: {total_sentence_count}")
print(f"Total number of words: {total_word_count}")
print(f"Vocabulary Size: {total_vocabulary_count}")
print(f"Average sentence length: {avg_sentence_length}")

# print(df_dataset.head())  # Adjust according to how you wish t

Total number of tokens: 63104.0
Total number of sentences: 9051.0
Total number of words: 63104.0
Vocabulary Size: 59758.0
Average sentence length: 7.813441658010585


In [61]:
!pip install textblob

import nltk
nltk.download('punkt')


Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: textblob
Successfully installed textblob-0.17.1


[nltk_data] Downloading package punkt to /Users/sachin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [62]:
from textblob import TextBlob

# Function to analyze text
def analyze_text_with_textblob(text):
    blob = TextBlob(text)
    
    # Sentence count
    sentence_count = len(blob.sentences)
    
    # Word count (including punctuation)
    word_count = len(blob.words)
    
    # Vocabulary size (unique words, including punctuation)
    vocabulary = set(blob.words)
    vocabulary_size = len(vocabulary)
    
    # Average sentence length in terms of words
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    
    return sentence_count, word_count, vocabulary_size, avg_sentence_length

# Assuming 'df' is your DataFrame and it has a column 'text' with the text you want to analyze
results = df_dataset['text'].apply(lambda x: analyze_text_with_textblob(x))

# Unpack the results into separate columns
df_dataset['sentence_count'], df_dataset['word_count'], df_dataset['vocabulary_size'], df_dataset['avg_sentence_length'] = zip(*results)

# To find the total number of tokens, you can sum up the word counts
total_tokens = df_dataset['word_count'].sum()

# total sentence count
total_sentence_count = df_dataset['sentence_count'].sum()

#total word_count
total_word_count = df_dataset['word_count'].sum()

#total vocabulary
total_vocabulary_count = df_dataset['vocabulary_size'].sum()

#avg sentence_length
avg_sentence_length = df_dataset['avg_sentence_length'].mean()

# Display the overall analysis results and the updated DataFrame
print(f"Total number of tokens: {total_tokens}")
print(f"Total number of sentences: {total_sentence_count}")
print(f"Total number of words: {total_word_count}")
print(f"Vocabulary Size: {total_vocabulary_count}")
print(f"Average sentence length: {avg_sentence_length}")


Total number of tokens: 55107
Total number of sentences: 7787
Total number of words: 55107
Vocabulary Size: 53904
Average sentence length: 7.5706718468255785
