In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<p style = "text-align: right; font-size: 13px; color: DarkBlue"> Irina Nedyalkova

<p style="font-size: 24px; text-align: center; color: DarkBlue" > Robots Talk

There are five robots here with the extraordinary names 0, 1, 2, 3 and 4. They speak in numbers. Each robot speaks 100000 times, and each utterance is a series of 10 numbers. The task is to train a model that, given the 10 numbers of one utterance, predicts with good accuracy which robot spoke them.

In [3]:
# Load the robot conversation sequences from the CSV file (path is relative to the notebook).
DATA_FILE = "conversation_sequences.csv"
robots_talk = pd.read_csv(DATA_FILE)

In [4]:
# Preview the loaded frame; for a frame this large pandas prints only the first and last rows.
robots_talk

Unnamed: 0,source,num1,num2,num3,num4,num5,num6,num7,num8,num9,num10
0,0,2,5,2,5,6,2,1,8,1,3
1,1,1747,1749,1751,1758,1765,1767,1772,1774,1783,1785
2,2,65056,195168,1561344,7806720,31226880,187361280,749445120,6745006080,6745006080,6745006080
3,3,2855,2860,2865,2870,2875,2880,2885,2890,2895,2900
4,4,11440,57200,286000,1430000,7150000,35750000,178750000,893750000,4468750000,22343750000
...,...,...,...,...,...,...,...,...,...,...,...
499995,0,8,1,7,5,1,3,2,4,4,5
499996,1,4882,4886,4888,4896,4902,4903,4907,4912,4920,4924
499997,2,32796,32796,295164,2656476,7969428,31877712,159388560,159388560,318777120,2231439840
499998,3,7751,7756,7761,7766,7771,7776,7781,7786,7791,7796


I rename the `source` column to `robot` for clarity:

In [5]:
# Relabel the "source" column as "robot"; every other column keeps its name.
robots_talk = robots_talk.rename({"source": "robot"}, axis="columns")

The target is the `robot` column, which identifies one of the 5 robots:

In [6]:
# Display the frame again to confirm that the label column is now called "robot".
robots_talk

Unnamed: 0,robot,num1,num2,num3,num4,num5,num6,num7,num8,num9,num10
0,0,2,5,2,5,6,2,1,8,1,3
1,1,1747,1749,1751,1758,1765,1767,1772,1774,1783,1785
2,2,65056,195168,1561344,7806720,31226880,187361280,749445120,6745006080,6745006080,6745006080
3,3,2855,2860,2865,2870,2875,2880,2885,2890,2895,2900
4,4,11440,57200,286000,1430000,7150000,35750000,178750000,893750000,4468750000,22343750000
...,...,...,...,...,...,...,...,...,...,...,...
499995,0,8,1,7,5,1,3,2,4,4,5
499996,1,4882,4886,4888,4896,4902,4903,4907,4912,4920,4924
499997,2,32796,32796,295164,2656476,7969428,31877712,159388560,159388560,318777120,2231439840
499998,3,7751,7756,7761,7766,7771,7776,7781,7786,7791,7796


See what columns/features I have to work with:

In [7]:
# List the column labels: the "robot" label plus the ten number columns num1..num10.
robots_talk.columns

Index(['robot', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8',
       'num9', 'num10'],
      dtype='object')

#### This is the language of Robot 0:

In [8]:
# Keep only the rows whose "robot" label is 0.
robot_zero = robots_talk.loc[robots_talk["robot"].eq(0)]

In [9]:
# Show Robot 0's sequences (truncated preview of the filtered frame).
robot_zero

Unnamed: 0,robot,num1,num2,num3,num4,num5,num6,num7,num8,num9,num10
0,0,2,5,2,5,6,2,1,8,1,3
5,0,4,9,5,4,8,6,6,7,7,2
10,0,1,6,5,8,8,8,2,6,9,6
15,0,7,5,3,3,8,9,8,6,7,2
20,0,3,7,9,2,5,8,8,7,5,4
...,...,...,...,...,...,...,...,...,...,...,...
499975,0,7,1,9,8,5,6,8,2,5,9
499980,0,8,1,5,6,2,5,4,7,3,8
499985,0,9,2,9,5,1,5,1,9,2,6
499990,0,8,5,6,2,1,4,3,7,6,8


#### Below the language of Robot 1:

In [10]:
# Keep only the rows whose "robot" label is 1.
robot_one = robots_talk.loc[robots_talk["robot"].eq(1)]

In [11]:
# Show Robot 1's sequences (truncated preview of the filtered frame).
robot_one

Unnamed: 0,robot,num1,num2,num3,num4,num5,num6,num7,num8,num9,num10
1,1,1747,1749,1751,1758,1765,1767,1772,1774,1783,1785
6,1,1487,1491,1498,1503,1512,1514,1518,1522,1524,1528
11,1,5060,5068,5069,5075,5081,5085,5090,5091,5099,5106
16,1,1472,1475,1476,1484,1488,1490,1494,1500,1501,1507
21,1,9590,9592,9595,9599,9604,9609,9613,9620,9623,9628
...,...,...,...,...,...,...,...,...,...,...,...
499976,1,6994,7003,7006,7010,7013,7014,7019,7026,7030,7038
499981,1,2495,2503,2509,2512,2513,2517,2521,2526,2528,2533
499986,1,7353,7357,7364,7366,7374,7380,7382,7384,7388,7391
499991,1,8905,8911,8913,8920,8924,8925,8928,8936,8943,8947


#### Below the language of Robot 2:

In [12]:
# Keep only the rows whose "robot" label is 2.
robot_two = robots_talk.loc[robots_talk["robot"].eq(2)]

In [13]:
# Show Robot 2's sequences (truncated preview of the filtered frame).
robot_two

Unnamed: 0,robot,num1,num2,num3,num4,num5,num6,num7,num8,num9,num10
2,2,65056,195168,1561344,7806720,31226880,187361280,749445120,6745006080,6745006080,6745006080
7,2,18850,18850,113100,452400,452400,1809600,3619200,3619200,7238400,50668800
12,2,72369,434214,868428,4342140,39079260,351713340,703426680,1406853360,11254826880,56274134400
17,2,20636,41272,165088,990528,3962112,15848448,126787584,253575168,1521451008,6085804032
22,2,8418,42090,84180,589260,2946300,17677800,141422400,282844800,2262758400,2262758400
...,...,...,...,...,...,...,...,...,...,...,...
499977,2,17592,123144,1108296,9974664,19949328,39898656,119695968,957567744,3830270976,15321083904
499982,2,7202,50414,453726,2722356,2722356,2722356,2722356,2722356,19056492,95282460
499987,2,54486,272430,817290,4086450,4086450,16345800,147112200,1029785400,9268068600,83412617400
499992,2,26136,130680,130680,261360,522720,1045440,4181760,33454080,267632640,535265280


#### The language of Robot 3:

In [14]:
# Keep only the rows whose "robot" label is 3.
robot_three = robots_talk.loc[robots_talk["robot"].eq(3)]

In [15]:
# Show Robot 3's sequences (truncated preview of the filtered frame).
robot_three

Unnamed: 0,robot,num1,num2,num3,num4,num5,num6,num7,num8,num9,num10
3,3,2855,2860,2865,2870,2875,2880,2885,2890,2895,2900
8,3,8962,8967,8972,8977,8982,8987,8992,8997,9002,9007
13,3,6270,6275,6280,6285,6290,6295,6300,6305,6310,6315
18,3,3463,3468,3473,3478,3483,3488,3493,3498,3503,3508
23,3,3067,3072,3077,3082,3087,3092,3097,3102,3107,3112
...,...,...,...,...,...,...,...,...,...,...,...
499978,3,5750,5755,5760,5765,5770,5775,5780,5785,5790,5795
499983,3,3207,3212,3217,3222,3227,3232,3237,3242,3247,3252
499988,3,1609,1614,1619,1624,1629,1634,1639,1644,1649,1654
499993,3,7472,7477,7482,7487,7492,7497,7502,7507,7512,7517


#### Language of the last Robot 4:

In [16]:
# Keep only the rows whose "robot" label is 4.
robot_four = robots_talk.loc[robots_talk["robot"].eq(4)]

In [17]:
# Show Robot 4's sequences (truncated preview of the filtered frame).
robot_four

Unnamed: 0,robot,num1,num2,num3,num4,num5,num6,num7,num8,num9,num10
4,4,11440,57200,286000,1430000,7150000,35750000,178750000,893750000,4468750000,22343750000
9,4,2870,14350,71750,358750,1793750,8968750,44843750,224218750,1121093750,5605468750
14,4,9270,46350,231750,1158750,5793750,28968750,144843750,724218750,3621093750,18105468750
19,4,28950,144750,723750,3618750,18093750,90468750,452343750,2261718750,11308593750,56542968750
24,4,19450,97250,486250,2431250,12156250,60781250,303906250,1519531250,7597656250,37988281250
...,...,...,...,...,...,...,...,...,...,...,...
499979,4,19030,95150,475750,2378750,11893750,59468750,297343750,1486718750,7433593750,37167968750
499984,4,35025,175125,875625,4378125,21890625,109453125,547265625,2736328125,13681640625,68408203125
499989,4,43690,218450,1092250,5461250,27306250,136531250,682656250,3413281250,17066406250,85332031250
499994,4,45170,225850,1129250,5646250,28231250,141156250,705781250,3528906250,17644531250,88222656250


As shown above, these 5 robots speak in numbers. This is a case of multi-class classification with time-series features. Each robot has its own pattern of speech, and because of that pattern I will work with a random sample of 5000 rows rather than the full dataset.

In [18]:
# Feature matrix: a random sample of 5000 rows with the "robot" label column removed.
# The fixed random_state makes the same rows come back on every Restart & Run All;
# without it each run draws a different sample and results cannot be reproduced.
# Sampled rows keep their original index labels.
attributes = robots_talk.drop(columns=["robot"]).sample(n=5000, random_state=42)

In [19]:
# Labels for exactly the rows that were sampled into `attributes`, in the same order.
# Calling .sample() on the label column separately would draw a different random set
# of rows, pairing each feature row with an unrelated robot label.
target = robots_talk.loc[attributes.index, "robot"]

If the accuracy scores computed on the training data and on the test data are comparable, the model is not overfitting.

<p style="text-align: left; color: gray"> work in progress...

_______________________________________________________________________________________________________________________________

References:
1. Kaggle dataset: https://www.kaggle.com/datasets/msk1097/classification-of-robots-from-their-conversation/