# Import Statements

In [43]:
import pandas as pd # Imported to enable the use of datastructures like dataframe
from sklearn.feature_extraction.text import TfidfVectorizer # Imported to convert raw documents into a matrix of tf idf features
from sklearn.linear_model import LogisticRegression # Imported to enable the use of logistic regression to classify text
from sklearn.model_selection import train_test_split # Imported to enable the user to split the data into train, test samples
from sklearn.metrics import classification_report, accuracy_score # Imported to calculate the accuracy and also print the classification report


# Importing the dataset

In [44]:
# Read the tab-separated training file into `df`, then print the frame's
# plain-text representation.
# NOTE(review): the path is relative, so the file is presumably expected in
# the notebook's working directory — confirm.
df = pd.read_csv(
    'malayalam_train.tsv',
    delimiter='\t',  # the file is tab-delimited, not comma-delimited
)
print(df)

                                                   text         category
0                hoo mammokka police vesham aaha anthas        Positive 
1        Oru rekshayum illa...kidilam kannu nananjupoyi        Positive 
2                             Ikka     waiting.........        Positive 
3                Raju Ettante Oro Shorttum Ijathi ppwli        Positive 
4      Ettan fansil netti poya aarenkilum undo?    #...        Positive 
...                                                 ...              ...
4846   Madhuraraja trailer Kand ivide vannanvar likkeee   unknown_state 
4847   Njn pru lalettan fan ahn..  eee trailer mass ...        Positive 
4848   Valiya pratheesha illa nalla entertainment  a...  Mixed_feelings 
4849   Dislike adikkunna kazhuthakalude mukhath adik...        Negative 
4850   Adipoli..... Pakshe oru sankadam ithinte thir...  Mixed_feelings 

[4851 rows x 2 columns]


# Preprocess the data (Add Later)

# Exploring the dataset

In [45]:
# Print the frame's dimensions as a (row count, column count) tuple.
print((len(df.index), len(df.columns)))
# Last expression of the cell: preview the first five rows.
df.head(n=5)

(4851, 2)


Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #...,Positive


In [50]:

# Show the rows of the neutral class.
# The raw file labels this class 'unknown_state' (the printed raw frame shows
# trailing whitespace after each label); the label only becomes 'Neutral' once
# the renaming cell further down has run. Matching on the stripped label and
# accepting both spellings keeps this cell correct on a fresh
# "Restart & Run All" as well as after the renaming cell has been executed.
# `df` itself is not modified here.
print(df.loc[df['category'].str.strip().isin(['Neutral', 'unknown_state'])])

                                                   text category
7                1 day achu 5 million views kooda varla  Neutral
9                           I a m a katta mammookka fan  Neutral
13     Dislike adikan keri like adichath njaan mathr...  Neutral
14                 Enada Oru English comment kuda kanom  Neutral
20     Ithu  Verum unda alla Machinegun  Nirachum un...  Neutral
...                                                 ...      ...
4836   പഠിച്ച കള്ളനാ... Like അടി മക്കളെ... Frst time...  Neutral
4839   Trillernu vandi wait cheydhadh njan maathrama...  Neutral
4841   Padam kandathinte shesham trailer kannunna et...  Neutral
4843                       soubin fans like adi makkale  Neutral
4846   Madhuraraja trailer Kand ivide vannanvar likkeee  Neutral

[1344 rows x 2 columns]


In [48]:
# Renaming the classes

# 1. Remove leading/trailing whitespace from every label, so comparisons such
#    as `== 'Positive'` match exactly.
# 2. Rename the raw labels to shorter class names:
#      'unknown_state'  -> 'Neutral'
#      'Mixed_feelings' -> 'Mixed'
#      'not-malayalam'  -> 'Irrelevant'
#    Labels that are not keys of the mapping are left unchanged.
# Re-running this cell is harmless: both steps are no-ops on already-cleaned
# labels.
df['category'] = df['category'].str.strip().replace({
    'unknown_state': 'Neutral',
    'Mixed_feelings': 'Mixed',
    'not-malayalam': 'Irrelevant',
})

# Number of items in each class after the renaming (displayed once, as the
# value of the cell's last expression).
df['category'].value_counts()

## There is a significant imbalance in the classes in this dataset

Positive      2022
Neutral       1344
Irrelevant     647
Negative       549
Mixed          289
Name: category, dtype: int64


Positive      2022
Neutral       1344
Irrelevant     647
Negative       549
Mixed          289
Name: category, dtype: int64

### Renaming the columns for convenience

Positive      2022
Neutral       1344
Irrelevant     647
Negative       549
Mixed          289
Name: category, dtype: int64


### Next, we need to map these classes to numbers for the machine learning model

In [None]:
# Encode each class name as an integer id.
# The keys are the class names produced by the renaming cell above
# ('unknown_state' -> 'Neutral', 'Mixed_feelings' -> 'Mixed',
# 'not-malayalam' -> 'Irrelevant'); any label that is not a key maps to NaN.
# The result is only displayed, not stored: `df.category` keeps its string
# labels.
df.category.map({
    'Positive': 0,
    'Negative': 1,
    'Neutral': 2,
    'Mixed': 3,
    'Irrelevant': 4,
})

In [None]:
# Splitting the dataset
# The classes are heavily imbalanced (the smallest class has 289 samples, the
# largest 2022), so the split is stratified on the label: every class keeps
# the same share in the train and test sets instead of leaving the minority
# classes' test representation to chance.
X_train, X_test, y_train, y_test = train_test_split(
    df.text,
    df.category,
    test_size=0.2,         # 20 % of samples will be present in test dataset
    random_state=42,       # fixed seed so the split is reproducible
    stratify=df.category,  # preserve class proportions in both subsets
)

# Vectorization using TF-IDF
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vocabulary, so nothing from the test set
# influences the features.
v = TfidfVectorizer()
X_train_tfidf = v.fit_transform(X_train)
X_test_tfidf = v.transform(X_test)

# Logistic Regression model (default hyper-parameters), trained on the
# TF-IDF features with the string class labels as targets
log_reg = LogisticRegression()
log_reg.fit(X_train_tfidf, y_train)

# Predictions and evaluation on the held-out test set: overall accuracy plus
# the per-class classification report
predictions = log_reg.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))
