In [105]:
!pip -q install pandas openpyxl

In [106]:
import pandas as pd
import re
import json

In [107]:
# Load the three splits: train/dev are tab-separated text files, test is an Excel workbook.
train = pd.read_csv("malayalam_train.tsv", sep="\t")
dev = pd.read_csv("malayalam_dev.tsv", sep="\t")
test = pd.read_excel("malayalam_test_results.xlsx")

# Report the (rows, columns) shape of every split.
for label, frame in (("Train shape:", train), ("Dev shape:", dev), ("Test shape:", test)):
    print(label, frame.shape)

# Report the column names of every split.
for label, frame in (("Train columns:", train), ("Dev columns:", dev), ("Test columns:", test)):
    print(label, frame.columns.tolist())

train.head()

Train shape: (4851, 2)
Dev shape: (540, 2)
Test shape: (1348, 3)
Train columns: ['text', 'category']
Dev columns: ['text', 'category']
Test columns: ['id', 'text', 'category']


Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #...,Positive


In [108]:
# Confirm the training frame's column names and peek at its first rows.
print(train.columns)
train.head()

Index(['text', 'category'], dtype='object')


Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #...,Positive


In [109]:
# Column names of the dev and test frames, for comparison with train.
print(dev.columns)
print(test.columns)

Index(['text', 'category'], dtype='object')
Index(['id', 'text', 'category'], dtype='object')


In [110]:
# Keep only the two modeling columns in every split; this drops the extra
# `id` column that only the test sheet carries.
train, dev, test = (frame[["text", "category"]] for frame in (train, dev, test))

In [111]:
# Discard rows with a missing text or category in every split.
train, dev, test = (frame.dropna() for frame in (train, dev, test))

In [112]:
# Remove exact duplicate (text, category) rows within each split.
train, dev, test = (frame.drop_duplicates() for frame in (train, dev, test))

In [113]:
def clean_text(text):
    """Normalize one raw comment for downstream modeling.

    Steps, in order:
      1. Coerce the value to ``str``.
      2. Remove URLs (tokens starting with ``http`` or ``www.``).
      3. Remove ``@mention`` handles.
      4. Cap runs of ``!`` / ``?`` at two characters so the emphasis survives.
      5. Collapse every whitespace run to a single space and trim both ends.

    Args:
        text: Raw comment. Any value is accepted; it is passed through ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    text = str(text)
    # Anchor on a word boundary and require the dot after "www" so elongated
    # words such as "awwww" are not mistaken for links and truncated.
    text = re.sub(r"\b(?:http\S+|www\.\S+)", "", text, flags=re.IGNORECASE)
    # Treat "@" as a mention only when it does not follow a word character,
    # which leaves e-mail addresses like a@b.com intact.
    text = re.sub(r"(?<!\w)@\w+", "", text)
    text = re.sub(r"!{2,}", "!!", text)           # limit !!! but keep emotion
    text = re.sub(r"\?{2,}", "??", text)          # limit ??? but keep emotion
    # Collapse whitespace last: deleting a URL or mention from the middle of a
    # sentence leaves two adjacent spaces behind.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Run the same cleaning routine over the text column of every split, in place.
for frame in (train, dev, test):
    frame["text"] = frame["text"].map(clean_text)

train.head()

Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #madu...,Positive


In [114]:
# Coerce labels to str and strip them so stray whitespace cannot create phantom classes.
for frame in (train, dev, test):
    frame["category"] = frame["category"].astype(str).str.strip()

# repr() makes any leftover invisible characters in a label visible.
sorted(map(repr, train["category"].unique()))

["'Mixed_feelings'",
 "'Negative'",
 "'Positive'",
 "'not-malayalam'",
 "'unknown_state'"]

In [115]:
# Class distribution of the training split, most frequent class first.
train["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Positive,2018
unknown_state,1340
not-malayalam,646
Negative,548
Mixed_feelings,288


In [116]:
# Category -> integer id, numbered by descending frequency in the training split.
label_map = {
    "Positive": 0,
    "unknown_state": 1,
    "not-malayalam": 2,
    "Negative": 3,
    "Mixed_feelings": 4
}

# Series.map yields NaN for any category missing from label_map, which would
# also turn label_id into a float column. Validate every split (not just
# train) and fail fast instead of silently saving broken labels.
for split_name, frame in (("train", train), ("dev", dev), ("test", test)):
    unmapped = sorted(set(frame["category"]) - set(label_map))
    if unmapped:
        raise ValueError(
            f"{split_name}: categories missing from label_map: {unmapped}"
        )
    frame["label_id"] = frame["category"].map(label_map).astype("int64")

In [117]:
# Training rows whose category was not found in label_map; an empty frame means every label mapped.
train[train["label_id"].isna()]

Unnamed: 0,text,category,label_id


In [118]:
# Repeat of the class-distribution view after label_id was added.
train["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Positive,2018
unknown_state,1340
not-malayalam,646
Negative,548
Mixed_feelings,288


In [119]:
# Repeat of the unmapped-label check on the training split (expected to be empty).
train[train["label_id"].isna()]

Unnamed: 0,text,category,label_id


In [120]:
# Distribution of the integer labels; it should mirror the category counts one-to-one.
train["label_id"].value_counts()

Unnamed: 0_level_0,count
label_id,Unnamed: 1_level_1
0,2018
1,1340
2,646
3,548
4,288


In [121]:
# Absolute and relative (percent) class frequencies of the training split, side by side.
counts = train["category"].value_counts()
percent = train["category"].value_counts(normalize=True) * 100

# `keys` names the two resulting columns directly.
summary = pd.concat([counts, percent], axis=1, keys=["count", "percent"])
summary

Unnamed: 0_level_0,count,percent
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Positive,2018,41.694215
unknown_state,1340,27.68595
not-malayalam,646,13.347107
Negative,548,11.322314
Mixed_feelings,288,5.950413


In [122]:
# Write each cleaned split to a CSV in the working directory, without the index column.
for frame, out_path in (
    (train, "clean_train.csv"),
    (dev, "clean_dev.csv"),
    (test, "clean_test.csv"),
):
    frame.to_csv(out_path, index=False)

In [123]:
# Persist the category -> id mapping next to the cleaned CSVs.
# (json is already imported in the notebook's import cell.)
with open("label_map.json", "w") as handle:
    handle.write(json.dumps(label_map))

print("Saved label_map.json")

Saved label_map.json


In [124]:
# Final sanity check: split sizes, training column dtypes, and three random training rows.
print(*(frame.shape for frame in (train, dev, test)))
print(train.dtypes)
print(train.sample(3))

(4840, 3) (540, 3) (1347, 3)
text        object
category    object
label_id     int64
dtype: object
                                                   text       category  \
1452     padam 2 thavana already kandavar aarelum undoo  unknown_state   
4204  Ithu dislike adicha pookavadi makkalodu onne p...       Negative   
202             Ayye poraaa.... lalettan nu role ille??       Negative   

      label_id  
1452         1  
4204         3  
202          3  


In [125]:
# Colab-only: mount Google Drive at /content/drive so the outputs can be written there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [126]:
# Create the Drive output folder; -p makes this a no-op when it already exists.
!mkdir -p /content/drive/MyDrive/Project2_preprocessing_mal_en

In [127]:
# Copy the cleaned splits to the mounted Drive folder, without the index column.
for frame, drive_path in (
    (train, "/content/drive/MyDrive/Project2_preprocessing_mal_en/clean_train.csv"),
    (dev, "/content/drive/MyDrive/Project2_preprocessing_mal_en/clean_dev.csv"),
    (test, "/content/drive/MyDrive/Project2_preprocessing_mal_en/clean_test.csv"),
):
    frame.to_csv(drive_path, index=False)

# Store the category -> id mapping alongside them.
# (json is already imported in the notebook's import cell.)
with open("/content/drive/MyDrive/Project2_preprocessing_mal_en/label_map.json", "w") as handle:
    handle.write(json.dumps(label_map))

In [128]:
# List the Drive folder to confirm the four output files were written.
!ls -lh /content/drive/MyDrive/Project2_preprocessing_mal_en

total 484K
-rw------- 1 root root  39K Feb 25 10:31 clean_dev.csv
-rw------- 1 root root  97K Feb 25 10:31 clean_test.csv
-rw------- 1 root root 347K Feb 25 10:31 clean_train.csv
-rw------- 1 root root   91 Feb 25 10:31 label_map.json
