In [36]:
!pip -q install pandas openpyxl

In [37]:
import pandas as pd
import re
import json

In [38]:
train = pd.read_csv("malayalam_train.tsv", sep="\t")
dev = pd.read_csv("malayalam_dev.tsv", sep="\t")
test = pd.read_excel("malayalam_test_results.xlsx")

print("Train shape:", train.shape)
print("Dev shape:", dev.shape)
print("Test shape:", test.shape)

print("Train columns:", list(train.columns))
print("Dev columns:", list(dev.columns))
print("Test columns:", list(test.columns))

train.head()

Train shape: (4851, 2)
Dev shape: (540, 2)
Test shape: (1348, 3)
Train columns: ['text', 'category']
Dev columns: ['text', 'category']
Test columns: ['id', 'text', 'category']


Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #...,Positive


In [39]:
print(train.columns)
train.head()

Index(['text', 'category'], dtype='object')


Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #...,Positive


In [40]:
print(dev.columns)
print(test.columns)

Index(['text', 'category'], dtype='object')
Index(['id', 'text', 'category'], dtype='object')


In [41]:
train = train[["text", "category"]]
dev = dev[["text", "category"]]
test = test[["text", "category"]]

In [42]:
train = train.dropna()
dev = dev.dropna()
test = test.dropna()

In [43]:
train = train.drop_duplicates()
dev = dev.drop_duplicates()
test = test.drop_duplicates()

In [44]:
def clean_text(text):
    text = str(text).strip()
    text = re.sub(r"\s+", " ", text)              # normalize spaces
    text = re.sub(r"http\S+|www\S+", "", text)    # remove URLs
    text = re.sub(r"@\w+", "", text)              # remove mentions
    text = re.sub(r"!{2,}", "!!", text)           # limit !!! but keep emotion
    text = re.sub(r"\?{2,}", "??", text)          # limit ??? but keep emotion
    return text.strip()

train["text"] = train["text"].apply(clean_text)
dev["text"] = dev["text"].apply(clean_text)
test["text"] = test["text"].apply(clean_text)

train.head()

Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #madu...,Positive


In [45]:
train["category"] = train["category"].astype(str).str.strip()
dev["category"] = dev["category"].astype(str).str.strip()
test["category"] = test["category"].astype(str).str.strip()

sorted([repr(x) for x in train["category"].unique()])

["'Mixed_feelings'",
 "'Negative'",
 "'Positive'",
 "'not-malayalam'",
 "'unknown_state'"]

In [46]:
train["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Positive,2018
unknown_state,1340
not-malayalam,646
Negative,548
Mixed_feelings,288


In [47]:
label_map = {
    "Positive": 0,
    "unknown_state": 1,
    "not-malayalam": 2,
    "Negative": 3,
    "Mixed_feelings": 4
}

train["label_id"] = train["category"].map(label_map)
dev["label_id"] = dev["category"].map(label_map)
test["label_id"] = test["category"].map(label_map)

In [48]:
train[train["label_id"].isna()]

Unnamed: 0,text,category,label_id


In [49]:
train["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Positive,2018
unknown_state,1340
not-malayalam,646
Negative,548
Mixed_feelings,288


In [50]:
train[train["label_id"].isna()]

Unnamed: 0,text,category,label_id


In [51]:
train["label_id"].value_counts()

Unnamed: 0_level_0,count
label_id,Unnamed: 1_level_1
0,2018
1,1340
2,646
3,548
4,288


In [52]:
counts = train["category"].value_counts()
percent = train["category"].value_counts(normalize=True) * 100

summary = pd.concat([counts, percent], axis=1)
summary.columns = ["count", "percent"]
summary

Unnamed: 0_level_0,count,percent
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Positive,2018,41.694215
unknown_state,1340,27.68595
not-malayalam,646,13.347107
Negative,548,11.322314
Mixed_feelings,288,5.950413


In [53]:
train.to_csv("clean_train.csv", index=False)
dev.to_csv("clean_dev.csv", index=False)
test.to_csv("clean_test.csv", index=False)

In [54]:
import json

with open("label_map.json", "w") as f:
    json.dump(label_map, f)

print("Saved label_map.json")

Saved label_map.json


In [55]:
print(train.shape, dev.shape, test.shape)
print(train.dtypes)
print(train.sample(3))

(4840, 3) (540, 3) (1347, 3)
text        object
category    object
label_id     int64
dtype: object
                                                   text       category  \
4391  Adi kapyare koottamani.....box office chattu.....       Positive   
1226  Gilla plus loham plus ramaleelaa...naariya kop...       Negative   
1681             DIDNT ONG - BAK GET MADE ALREADY > :-/  not-malayalam   

      label_id  
4391         0  
1226         3  
1681         2  


In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
!mkdir -p /content/drive/MyDrive/Project2_preprocessing_mal_en

In [58]:
import re

def english_word_ratio(s: str) -> float:
    s = str(s)
    tokens = s.split()
    if len(tokens) == 0:
        return 0.0
    english_like = sum(1 for w in tokens if bool(re.fullmatch(r"[A-Za-z]+", w)))
    return english_like / len(tokens)

In [59]:
for df in [train, dev, test]:
    df["english_word_ratio"] = df["text"].apply(english_word_ratio)

train[["text", "category", "english_word_ratio"]].head()

Unnamed: 0,text,category,english_word_ratio
0,hoo mammokka police vesham aaha anthas,Positive,1.0
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive,0.8
2,Ikka waiting.........,Positive,0.5
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive,1.0
4,Ettan fansil netti poya aarenkilum undo? #madu...,Positive,0.625


In [60]:
train.groupby("category")["english_word_ratio"].mean().sort_values(ascending=False)

Unnamed: 0_level_0,english_word_ratio
category,Unnamed: 1_level_1
Mixed_feelings,0.891514
Negative,0.88612
unknown_state,0.871359
Positive,0.867232
not-malayalam,0.822796


In [61]:
def codemix_bucket(r):
    if 0.20 <= r <= 0.80:
        return "high_codemix"
    return "low_codemix"

train["codemix_bucket"] = train["english_word_ratio"].apply(codemix_bucket)
train["codemix_bucket"].value_counts()

Unnamed: 0_level_0,count
codemix_bucket,Unnamed: 1_level_1
low_codemix,3546
high_codemix,1294


In [62]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(train["label_id"])

weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=train["label_id"]
)

class_weights = {int(cls): float(w) for cls, w in zip(classes, weights)}

class_weights

{0: 0.4796828543111992,
 1: 0.7223880597014926,
 2: 1.498452012383901,
 3: 1.7664233576642336,
 4: 3.361111111111111}

In [63]:
with open("class_weights.json", "w") as f:
    json.dump(class_weights, f)

print("class_weights.json saved locally")

class_weights.json saved locally


In [64]:
train.to_csv("/content/drive/MyDrive/Project2_preprocessing_mal_en/clean_train.csv", index=False)
dev.to_csv("/content/drive/MyDrive/Project2_preprocessing_mal_en/clean_dev.csv", index=False)
test.to_csv("/content/drive/MyDrive/Project2_preprocessing_mal_en/clean_test.csv", index=False)

import json
with open("/content/drive/MyDrive/Project2_preprocessing_mal_en/label_map.json", "w") as f:
    json.dump(label_map, f)

with open("/content/drive/MyDrive/Project2_preprocessing_mal_en/class_weights.json", "w") as f:
    json.dump(class_weights, f)

print("All files saved successfully.")

All files saved successfully.


In [65]:
!ls -lh /content/drive/MyDrive/Project2_preprocessing_mal_en

total 608K
-rw------- 1 root root  123 Feb 25 13:07 class_weights.json
-rw------- 1 root root  45K Feb 25 13:07 clean_dev.csv
-rw------- 1 root root 111K Feb 25 13:07 clean_test.csv
-rw------- 1 root root 452K Feb 25 13:07 clean_train.csv
-rw------- 1 root root   91 Feb 25 13:07 label_map.json
