In [1]:
from urllib.request import urlopen

data_set_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fncomms5212/MediaObjects/41467_2014_BFncomms5212_MOESM1045_ESM.txt'
data = urlopen(data_set_url)

In [2]:
my_data = []
for line in data:
    data_row = line.decode().rstrip()
    my_data.append([term for term in data_row.split('\t')])

In [3]:
import pandas as pd
df = pd.DataFrame.from_records(my_data[1:], columns = my_data[0])
df.columns

Index(['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence',
       'TFIDF score'],
      dtype='object')

In [4]:
df.columns = ['symptom','disease','n','score']
df['symptom'] = df['symptom'].astype('category')
df['disease'] = df['disease'].astype('category')
df['n'] = df['n'].astype('int')
df['score'] = df['score'].astype('float')
df = df.sort_values(by=['disease', 'symptom', 'n', 'score'])
possible_diseases = set(df['disease'])
possible_symptoms = set(df['symptom'])
print('# Unique Diseases: {}'.format(len(possible_diseases)))
print('# Unique Symptoms: {}'.format(len(possible_symptoms)))

symptom    category
disease    category
n             int64
score       float64
dtype: object

In [7]:
# df.to_csv('symptom_disease_dataset.csv', index=False)

In [8]:
df['symptom_encod'] = df['symptom'].cat.codes
df['disease_encod'] = df['disease'].cat.codes

In [6]:
import pandas as pd
df = pd.read_csv('symptom_disease_dataset.csv')
df.head()

Unnamed: 0,symptom,disease,n,score
0,Language Development Disorders,22q11 Deletion Syndrome,1,2.486567
1,Mental Retardation,22q11 Deletion Syndrome,1,0.905447
2,Olfaction Disorders,22q11 Deletion Syndrome,1,2.28823
3,Respiratory Sounds,22q11 Deletion Syndrome,1,1.639269
4,Virilism,"46, XX Disorders of Sex Development",1,2.227056


In [18]:
n = len(df)
diseases = list(set(df['disease']))
diseases_csv = pd.DataFrame([['disease_id'] + diseases])
diseases_csv.to_csv('disease_ids.csv', index=False)

# Disease To Symptom Classifier

## Path To Completion

1. Reformat data-frame to have symp_1, ..., symp_n, disease_i for each row, with the values being the occurence of symptom j within 1 <= j <= n for disease i.

2. Create simulated training and testing data for each disease which match original data's original frequency distribution

3. Run random forests model on data

## Step 1 - Reformat Data-Frame

In [9]:
symptom_encod = set(df['symptom_encod'])
new_col_names = [""]*len(symptom_encod)

for i, symp_i in enumerate(symptom_encod):
    new_col_names[i] = "symp_" + str(symp_i)

new_col_names.append("disease")

In [10]:
disease_encod = set(df['disease_encod'])
new_df = pd.DataFrame([[0]*len(new_col_names) for _ in range(len(disease_encod))], columns=new_col_names)

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# test grabbing 'n'
df[(df.disease_encod == 1) & (df.symptom_encod == 312)][['n']].iloc[0]['n']

1

In [14]:
num_cols = len(new_df.columns)
def edit_row(rnge):
    start, end = rnge
    for disease in range(start, end+1):
        new_row = [0]*num_cols
        new_row[-1] = disease
        
        for j in range(num_cols-1):
            res = df[(df['disease_encod'] == disease) & (df['symptom_encod'] == j)]
            if len(res) > 0:
                new_row[j] = res[['n']].iloc[0]['n']

        # write over row in df
        new_df.iloc[disease] = new_row

In [27]:
# uncomment for re-processing
import threading

def start_threads(threads):
    for thread in threads:
        thread.start()

def join_threads(threads):
    for thread in threads:
        thread.join()

def multi_thread(n_rows, n_threads, proc):
    if n_threads < 1:
        return
    
    jump = n_rows // n_threads
    
    threads = []
    for i in range(0, n_rows, jump):
        # split range -> [start, end] --> feed to proc
        start, end = i, i+jump
        
        # extend for final thread
        if i // jump == n_threads:
            end = n_rows-1
            
        t = threading.Thread(target=proc, args=((start,end),))
        threads.append(t)
    print('Hello')
    start_threads(threads)
    join_threads(threads)

In [2]:
### Creates the re-formatted data-frame!
# multi_thread(len(new_df), 4, edit_row)

In [3]:
# uncomment when processing again
# new_df.to_csv('symptom_disease_dataset_reformatted.csv', index=False)

In [33]:
import pandas as pd
new_df = pd.read_csv('symptom_disease_dataset_reformatted.csv')

# make columns floating point data types
for col in new_df.columns[:-1]:
    new_df[col] = new_df[col].astype('float')

In [34]:
from sklearn.model_selection import train_test_split
import numpy as np

x_cols = new_df.columns[:-1].tolist()
X = new_df[pd.Index(x_cols)]
y = new_df['disease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
clf = RandomForestClassifier(n_estimators=100)

#X_train.reshape(-1,1)
def model_predict(X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

def print_accuracy(y_test, y_pred):
    print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_pred)))

In [35]:
print_accuracy(y_test, model_predict(X_train, y_train, X_test))

Accuracy: 0.0


## Goal: Artificially Augment the data
- 1. Identify distribution of data
- 2. Randomly create new data and append to dataframe

In [6]:
import plotly.graph_objects as go

def bar_plot(x, y):
    fig = go.Figure([go.Bar(x=x,y=y)])
    fig.show()

def disease_symptom_dist(disease_id):
    disease = new_df[new_df['disease'] == disease_id]
    y_vals = disease[disease.columns[:-1]].iloc[0].tolist()
    
    # create bar chart
    bar_plot(disease.columns[:-1], y_vals)

In [7]:
import numpy as np
def apply_noise(df, mu=0, sigma=0.1, round=True):
    n_rows, n_cols = len(df), len(df.columns)
    noise = np.random.normal(mu, sigma, [n_rows, n_cols])
    return df + noise

In [8]:
# not good enough
# capture mu and sigma for each symptom
def get_mu(col):
    return sum(col)/len(col) if len(col) else 0

def get_sigma(col, mu=0.1):
    n = len(col)
    return sum((col-mu)**2)/n

In [9]:
# overwrite new_df by taking row proportions
def row_proportion(row):
    row_sum = sum(row[:-1])
    for i, item in enumerate(row[:-1]):
        row[i] = item*1.0 / (row_sum if row_sum else 1)
    return row

In [10]:
def apply_row_proportion(bounds):
    start, end = bounds
    for i in range(start, end+1):
        new_df.iloc[i] = row_proportion(new_df.iloc[i])

In [11]:
import time

tik = time.clock()
### Applies row proportions
# multi_thread(len(new_df), 5, apply_row_proportion)
tok = time.clock()
print('Total time: {}'.format(tok-tik))

AttributeError: module 'time' has no attribute 'clock'

In [12]:
# new_df.to_csv('symptom_disease_dataset_row_proportions.csv', index=False)

In [13]:
# import row proportions
#
# EXECUTE THIS DOWN FOR ARTIFICAL DATA CREATION
#
#
import pandas as pd
new_df = pd.read_csv('symptom_disease_dataset_row_proportions.csv')

for col in new_df.columns[:-1]:
    new_df[col] = new_df[col].astype('float')
    
new_df.head()

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,4.0


## Row-proportions complete!

Now what?

Artificial data creation...

Let's say for a given disease, we have proportions: `[0.1, 0.2, 0.7]`.

We randomly sample a number between 1-1000 for each symptom, we get: `[100, 250, 400]`

Now apply the proportions: `[10, 50, 280]`

Recalculate the row-proportions: `[.027, 0.205, .788]` This is our new row of data.

In [14]:
import random

def get_artificial_row(row):
    n = len(row)
    artificial_row = [0.0]*n
    
    # create new artificial counts
    for i, actual_proportion in enumerate(row[:-1]):
        artificial_row[i] = random.randint(0,1000) * actual_proportion
    
    # add disease to artificial row
    artificial_row[-1] = row[-1]
    
    return row_proportion(artificial_row)

In [15]:
# looking good, let's multithread and enhance!!

# create massive data frame, where each piece of data has 500 additional rows of artificial data
N_ADDITIONAL = 200

# stretch new_df
old_n_rows, old_n_cols = len(new_df), len(new_df.columns)
new_n_rows = old_n_rows + old_n_rows * N_ADDITIONAL
#big_df = pd.DataFrame([[0]*old_n_cols for _ in range(new_n_rows)], columns=new_df.columns)


def apply_artificial_data(bounds):
    start, end = bounds
    for i in range(start, end+1):
        # add N_ADDITIONAL new rows of data
        for j in range(1, N_ADDITIONAL + 1):
            big_data.iloc[i+j] = get_artificial_row(new_df.iloc[i])


In [16]:
# create 10 big data csvs
"""
big_data = [[0]*old_n_cols for _ in range(new_n_rows)]
i, jump = 0, len(big_data)//10

big_data_1 = big_data.iloc[i:i+jump]
i += jump
big_data_2 = big_data.iloc[i:i+jump]
i += jump
big_data_3 = big_data.iloc[i:i+jump]
i += jump
big_data_4 = big_data.iloc[i:i+jump]
i += jump
big_data_5 = big_data.iloc[i:i+jump]
i += jump
big_data_6 = big_data.iloc[i:i+jump]
i += jump
big_data_7 = big_data.iloc[i:i+jump]
i += jump
big_data_8 = big_data.iloc[i:i+jump]
i += jump
big_data_9 = big_data.iloc[i:i+jump]
i += jump
big_data_10 = big_data.iloc[i:]
i += jump
big_data_1.to_csv('big_data_1.csv')
big_data_2.to_csv('big_data_2.csv')
big_data_3.to_csv('big_data_3.csv')
big_data_4.to_csv('big_data_4.csv')
big_data_5.to_csv('big_data_5.csv')
big_data_6.to_csv('big_data_6.csv')
big_data_7.to_csv('big_data_7.csv')
big_data_8.to_csv('big_data_8.csv')
big_data_9.to_csv('big_data_9.csv')
big_data_10.to_csv('big_data_10.csv')
"""

"\nbig_data = [[0]*old_n_cols for _ in range(new_n_rows)]\ni, jump = 0, len(big_data)//10\n\nbig_data_1 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_2 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_3 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_4 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_5 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_6 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_7 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_8 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_9 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_10 = big_data.iloc[i:]\ni += jump\nbig_data_1.to_csv('big_data_1.csv')\nbig_data_2.to_csv('big_data_2.csv')\nbig_data_3.to_csv('big_data_3.csv')\nbig_data_4.to_csv('big_data_4.csv')\nbig_data_5.to_csv('big_data_5.csv')\nbig_data_6.to_csv('big_data_6.csv')\nbig_data_7.to_csv('big_data_7.csv')\nbig_data_8.to_csv('big_data_8.csv')\nbig_data_9.to_csv('big_data_9.csv')\nbig_data_10.to_csv('big_data_10.csv')\n"

In [17]:
import pandas as pd
big_data_1 = pd.read_csv('big_data_1.csv')
big_data_2 = pd.read_csv('big_data_2.csv')
big_data_3 = pd.read_csv('big_data_3.csv')
big_data_4 = pd.read_csv('big_data_4.csv')
big_data_5 = pd.read_csv('big_data_5.csv')
big_data_6 = pd.read_csv('big_data_6.csv')
big_data_7 = pd.read_csv('big_data_7.csv')
big_data_8 = pd.read_csv('big_data_8.csv')
big_data_9 = pd.read_csv('big_data_9.csv')
big_data_10 = pd.read_csv('big_data_10.csv')

big_data = big_data_1.append(big_data_2, ignore_index=True).append(big_data_3, ignore_index=True).append(big_data_4, ignore_index=True).append(big_data_5, ignore_index=True).append(big_data_6, ignore_index=True).append(big_data_7, ignore_index=True).append(big_data_8, ignore_index=True).append(big_data_9, ignore_index=True).append(big_data_10, ignore_index=True)

In [18]:
### Run this for artificial data creation!
multi_thread(len(big_data), 5, apply_artificial_data)

Exception in thread Thread-6:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
Exception in thread Thread-7:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
    Exception in thread Thread-8:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
self.run()
  File "e:\desktop\lib\threading.py", line 870, in run
Exception in thread Thread-9:
Exception in thread Thread-10:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
        self.run()
  File "e:\desktop\lib\threading.py", line 870, in run
self.run()
  File "e:\desktop\lib\threading.py", line 870, in run
Exception in thread Thread-11:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootst

In [19]:
big_data.head()

Unnamed: 0.1,Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
print(big_data.iloc[0])

Unnamed: 0    0
symp_0        0
symp_1        0
symp_2        0
symp_3        0
             ..
symp_318      0
symp_319      0
symp_320      0
symp_321      0
disease       0
Name: 0, Length: 324, dtype: int64


In [21]:
for val in big_data.iloc[0]:
    print(val)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [22]:
big_data.head()

Unnamed: 0.1,Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
big_data.columns

Index(['Unnamed: 0', 'symp_0', 'symp_1', 'symp_2', 'symp_3', 'symp_4',
       'symp_5', 'symp_6', 'symp_7', 'symp_8',
       ...
       'symp_313', 'symp_314', 'symp_315', 'symp_316', 'symp_317', 'symp_318',
       'symp_319', 'symp_320', 'symp_321', 'disease'],
      dtype='object', length=324)

In [24]:
del big_data['Unnamed: 0']

In [25]:
big_data.head()

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
multi_thread(len(big_data), 5, apply_artificial_data)

Exception in thread Thread-13:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
Exception in thread Thread-14:
Exception in thread Traceback (most recent call last):
Thread-15:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
        Exception in thread Thread-16:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
    Exception in thread self.run()self.run()
  File "e:\desktop\lib\threading.py", line 870, in run
Thread-17:
Traceback (most recent call last):
  File "e:\desktop\lib\threading.py", line 932, in _bootstrap_inner
self.run()
  File "e:\desktop\lib\threading.py", line 870, in run
        self.run()
  File "e:\desktop\lib\threading.py", line 870, in run

  File "e:\desktop\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
  Fi

In [28]:
big_data.head()

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,0.0,0.040102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017913,0.069918,3.0


In [32]:
big_data.iloc[10000:10010]

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
i, jump = 0, len(big_data)//10

big_data_1 = big_data.iloc[i:i+jump]
i += jump
big_data_2 = big_data.iloc[i:i+jump]
i += jump
big_data_3 = big_data.iloc[i:i+jump]
i += jump
big_data_4 = big_data.iloc[i:i+jump]
i += jump
big_data_5 = big_data.iloc[i:i+jump]
i += jump
big_data_6 = big_data.iloc[i:i+jump]
i += jump
big_data_7 = big_data.iloc[i:i+jump]
i += jump
big_data_8 = big_data.iloc[i:i+jump]
i += jump
big_data_9 = big_data.iloc[i:i+jump]
i += jump
big_data_10 = big_data.iloc[i:]
i += jump
big_data_1.to_csv('art_data_1.csv')
big_data_2.to_csv('art_data_2.csv')
big_data_3.to_csv('art_data_3.csv')
big_data_4.to_csv('art_data_4.csv')
big_data_5.to_csv('art_data_5.csv')
big_data_6.to_csv('art_data_6.csv')
big_data_7.to_csv('art_data_7.csv')
big_data_8.to_csv('art_data_8.csv')
big_data_9.to_csv('art_data_9.csv')
big_data_10.to_csv('art_data_10.csv')