In [37]:
# import statements
import re
import pandas as pd
from sklearn.model_selection import train_test_split

In [38]:
# import dravidian dataset
drav_final = pd.read_csv("drav_final.csv")
drav_final

Unnamed: 0,tam,kan,mal,tel
0,peyar,hesaru,per,
1,namam,nama,namam,
2,vivaram,,vivaranam,
3,etirpatam,,viparitam,
4,karunai,karune,karuna,
...,...,...,...,...
7215,,,visadamaya,visadanga
7216,,,nisvarthanayi,nisvartham
7217,,,asamayam,asamayamlo
7218,,,niyamanusrtam,niyamanusaramuga


In [39]:
# remove all inputs with NaN in tamil column
tamil_data = drav_final[drav_final["tam"].notnull()]
tamil_data

Unnamed: 0,tam,kan,mal,tel
0,peyar,hesaru,per,
1,namam,nama,namam,
2,vivaram,,vivaranam,
3,etirpatam,,viparitam,
4,karunai,karune,karuna,
...,...,...,...,...
2754,ki.pi.,,,e.di
2755,katinamaka,kathinavagi,,kathinanga
2756,accariyamaka,ascaryakara,ascaryapurvam,ascaryakaramaina
2757,piratyekamaka,,pratyekamayi,pratyekanga


In [40]:
# connect multi-word inputs with _
tamil_data['kan'] = tamil_data['kan'].apply(lambda x: x.replace(" ", "") if isinstance(x, str) else x)
tamil_data['tel'] = tamil_data['tel'].apply(lambda x: x.replace(" ", "") if isinstance(x, str) else x)
tamil_data['mal'] = tamil_data['mal'].apply(lambda x: x.replace(" ", "") if isinstance(x, str) else x)
tamil_data['tam'] = tamil_data['tam'].apply(lambda x: x.replace(" ", "") if isinstance(x, str) else x)

tamil_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tamil_data['kan'] = tamil_data['kan'].apply(lambda x: x.replace(" ", "") if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tamil_data['tel'] = tamil_data['tel'].apply(lambda x: x.replace(" ", "") if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

Unnamed: 0,tam,kan,mal,tel
0,peyar,hesaru,per,
1,namam,nama,namam,
2,vivaram,,vivaranam,
3,etirpatam,,viparitam,
4,karunai,karune,karuna,
...,...,...,...,...
2754,ki.pi.,,,e.di
2755,katinamaka,kathinavagi,,kathinanga
2756,accariyamaka,ascaryakara,ascaryapurvam,ascaryakaramaina
2757,piratyekamaka,,pratyekamayi,pratyekanga


In [41]:
# replace null values with NaN
tamil_data = tamil_data.fillna("NaN")
tamil_data

Unnamed: 0,tam,kan,mal,tel
0,peyar,hesaru,per,
1,namam,nama,namam,
2,vivaram,,vivaranam,
3,etirpatam,,viparitam,
4,karunai,karune,karuna,
...,...,...,...,...
2754,ki.pi.,,,e.di
2755,katinamaka,kathinavagi,,kathinanga
2756,accariyamaka,ascaryakara,ascaryapurvam,ascaryakaramaina
2757,piratyekamaka,,pratyekamayi,pratyekanga


In [42]:
def clean_line(line):
    return re.sub(r'[^a-z\s]', '', line) if line != "NaN" else "NaN"

In [43]:
# remove non-letter characters
tamil_data["tam"] = tamil_data["tam"].apply(clean_line)
tamil_data["kan"] = tamil_data["kan"].apply(clean_line)
tamil_data["mal"] = tamil_data["mal"].apply(clean_line)
tamil_data["tel"] = tamil_data["tel"].apply(clean_line)

tamil_data

Unnamed: 0,tam,kan,mal,tel
0,peyar,hesaru,per,
1,namam,nama,namam,
2,vivaram,,vivaranam,
3,etirpatam,,viparitam,
4,karunai,karune,karuna,
...,...,...,...,...
2754,kipi,,,edi
2755,katinamaka,kathinavagi,,kathinanga
2756,accariyamaka,ascaryakara,ascaryapurvam,ascaryakaramaina
2757,piratyekamaka,,pratyekamayi,pratyekanga


In [44]:
# separate into training, validation, and testing data
train, temp = train_test_split(tamil_data, test_size=0.2, random_state=42)
valid, test = train_test_split(temp, test_size=0.5, random_state=42)

train, valid, test

(              tam              kan             mal    tel
 2650        cinna              NaN             NaN  cinna
 261        kotari           kodali             NaN    NaN
 1610  aniyayamana         anyayada      anyayamaya    NaN
 1407   natakamana  natakiyavadanta             NaN    NaN
 1789       paravu           haradu             NaN    NaN
 ...           ...              ...             ...    ...
 1638   ekantamana              NaN      ekantamaya    NaN
 1095    camakonam         samakona             NaN    NaN
 1130        poran            boran           boron    NaN
 1294  kopamillata  krodavilladanta  amarsamillatta    NaN
 860   velaikkarar              NaN      velakkaran    NaN
 
 [2207 rows x 4 columns],
                             tam                 kan  \
 210                   kattukari                 NaN   
 1517          ataiyalamteriyata                 NaN   
 2142                   arimukam                 NaN   
 867                  culalkarru        

In [45]:
# combine kannada, malayalam, and telugu into one word
def combine_inputs(row):
    combined = []
    for i in range(max(len(row["kan"]), len(row["mal"]), len(row["tel"]))):
        if i < len(row["kan"]):
            if row["kan"] != "NaN":
                combined.append(row["kan"][i])

        if i < len(row["mal"]):
            if row["mal"] != "NaN":
                combined.append(row["mal"][i])

        if i < len(row["tel"]):
            if row["tel"] != "NaN":
                combined.append(row["tel"][i])

        combined.append(" ")

    return "".join(combined)

In [46]:
# split tamil by spaces
def split_by_space(row):
    return " ". join(list(row))

In [47]:
# training data
train_inputs = train.apply(combine_inputs, axis=1)
train_targets = train['tam'].apply(split_by_space)

# validation data
valid_inputs = valid.apply(combine_inputs, axis=1)
valid_targets = valid['tam'].apply(split_by_space)

# testing data
test_inputs = test.apply(combine_inputs, axis=1)
test_targets = test['tam'].apply(split_by_space)

In [48]:
# save files
train_inputs.to_csv('..\\data\\trial4\\train_inputs.txt', index=False, header=False)
train_targets.to_csv('..\\data\\trial4\\train_targets.txt', index=False, header=False)

valid_inputs.to_csv('..\\data\\trial4\\valid_inputs.txt', index=False, header=False)
valid_targets.to_csv('..\\data\\trial4\\valid_targets.txt', index=False, header=False)

test_inputs.to_csv('..\\data\\trial4\\test_inputs.txt', index=False, header=False)
test_targets.to_csv('..\\data\\trial4\\test_targets.txt', index=False, header=False)

In [51]:
sing = list("abcdefghijklmnopqrstuvwxyz")
sing

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [None]:
dub = []
for char in sing:
    for char2 in sing:
        dub.append(char + char2)

dub

['aa',
 'ab',
 'ac',
 'ad',
 'ae',
 'af',
 'ag',
 'ah',
 'ai',
 'aj',
 'ak',
 'al',
 'am',
 'an',
 'ao',
 'ap',
 'aq',
 'ar',
 'as',
 'at',
 'au',
 'av',
 'aw',
 'ax',
 'ay',
 'az',
 'ba',
 'bb',
 'bc',
 'bd',
 'be',
 'bf',
 'bg',
 'bh',
 'bi',
 'bj',
 'bk',
 'bl',
 'bm',
 'bn',
 'bo',
 'bp',
 'bq',
 'br',
 'bs',
 'bt',
 'bu',
 'bv',
 'bw',
 'bx',
 'by',
 'bz',
 'ca',
 'cb',
 'cc',
 'cd',
 'ce',
 'cf',
 'cg',
 'ch',
 'ci',
 'cj',
 'ck',
 'cl',
 'cm',
 'cn',
 'co',
 'cp',
 'cq',
 'cr',
 'cs',
 'ct',
 'cu',
 'cv',
 'cw',
 'cx',
 'cy',
 'cz',
 'da',
 'db',
 'dc',
 'dd',
 'de',
 'df',
 'dg',
 'dh',
 'di',
 'dj',
 'dk',
 'dl',
 'dm',
 'dn',
 'do',
 'dp',
 'dq',
 'dr',
 'ds',
 'dt',
 'du',
 'dv',
 'dw',
 'dx',
 'dy',
 'dz',
 'ea',
 'eb',
 'ec',
 'ed',
 'ee',
 'ef',
 'eg',
 'eh',
 'ei',
 'ej',
 'ek',
 'el',
 'em',
 'en',
 'eo',
 'ep',
 'eq',
 'er',
 'es',
 'et',
 'eu',
 'ev',
 'ew',
 'ex',
 'ey',
 'ez',
 'fa',
 'fb',
 'fc',
 'fd',
 'fe',
 'ff',
 'fg',
 'fh',
 'fi',
 'fj',
 'fk',
 'fl',
 'fm',

In [53]:
trip = []
for chars in dub:
    for char in sing:
        trip.append(chars + char)

trip

['aaa',
 'aab',
 'aac',
 'aad',
 'aae',
 'aaf',
 'aag',
 'aah',
 'aai',
 'aaj',
 'aak',
 'aal',
 'aam',
 'aan',
 'aao',
 'aap',
 'aaq',
 'aar',
 'aas',
 'aat',
 'aau',
 'aav',
 'aaw',
 'aax',
 'aay',
 'aaz',
 'aba',
 'abb',
 'abc',
 'abd',
 'abe',
 'abf',
 'abg',
 'abh',
 'abi',
 'abj',
 'abk',
 'abl',
 'abm',
 'abn',
 'abo',
 'abp',
 'abq',
 'abr',
 'abs',
 'abt',
 'abu',
 'abv',
 'abw',
 'abx',
 'aby',
 'abz',
 'aca',
 'acb',
 'acc',
 'acd',
 'ace',
 'acf',
 'acg',
 'ach',
 'aci',
 'acj',
 'ack',
 'acl',
 'acm',
 'acn',
 'aco',
 'acp',
 'acq',
 'acr',
 'acs',
 'act',
 'acu',
 'acv',
 'acw',
 'acx',
 'acy',
 'acz',
 'ada',
 'adb',
 'adc',
 'add',
 'ade',
 'adf',
 'adg',
 'adh',
 'adi',
 'adj',
 'adk',
 'adl',
 'adm',
 'adn',
 'ado',
 'adp',
 'adq',
 'adr',
 'ads',
 'adt',
 'adu',
 'adv',
 'adw',
 'adx',
 'ady',
 'adz',
 'aea',
 'aeb',
 'aec',
 'aed',
 'aee',
 'aef',
 'aeg',
 'aeh',
 'aei',
 'aej',
 'aek',
 'ael',
 'aem',
 'aen',
 'aeo',
 'aep',
 'aeq',
 'aer',
 'aes',
 'aet',
 'aeu',


In [55]:
file = open("..\data\\trial4\\vocab_src.txt", "a")

for i in (sing + dub + trip):
    file.write(i + "\n")

file.close()