In [1]:
# Author: Helen Silva
# Version: 4.0
# Pre-processing of patients' genetic (SNP) data

### Import the necessary libraries

In [2]:
import pickle
import sys

import numpy as np
import pandas as pd

from sklearn.preprocessing import Imputer

# enable printing of large arrays without truncation
# (the threshold has to be a real number: recent NumPy releases reject NaN
# here, so the largest native integer is used as "never summarise")
np.set_printoptions(threshold=sys.maxsize)

### Get features and target data

In [3]:
# load the raw SNP table (semicolon-separated CSV)
raw_csv_path = '../data/article-genetic-data.csv'
dna_df = pd.read_csv(raw_csv_path, sep=';')

# target column, kept aside as its own Series
response = dna_df['response']

# full-row slice of the table; the 'response' column is still part of it here
genes = dna_df[:]

# show the (rows, columns) of the table
genes.shape

(241, 120)

### Check missing values on features dataframe

In [4]:
# count the missing entries of every column and show the per-column totals
missing_per_column = genes.isnull().sum()
print(missing_per_column)

response       0
rs2229109      1
rs1128503      3
rs1045642      3
rs2235048      3
rs717620      29
rs2756104      7
rs2273697      8
rs2002042      8
rs4148396     25
rs3758395     13
rs3740067      8
rs17216317    10
rs3740066      7
rs3740065     10
rs1137968     12
rs8187710     13
rs162549      33
rs9341263     24
rs162562      25
rs162561      43
rs2551188     32
rs9341244     28
rs2404955     24
rs6956344     17
rs4646437     32
rs28988579    26
rs12114000    12
rs776746      29
rs28365067    20
              ..
rs2515642     30
rs2687111     24
rs2756109     39
rs28365083    16
rs28365087    34
rs28371730    51
rs28399419    21
rs28399429    38
rs2855658     25
rs2856844     35
rs3213619     36
rs3758580     16
rs3758581     19
rs4304697     20
rs4417205     38
rs4617515     16
rs4646457     31
rs4917623     19
rs4917639     37
rs4918797     31
rs4986879     34
rs743534      48
rs762551      16
rs7897079     32
rs7916649     42
rs9282564     46
rs9332104     22
rs9332168     

### Fill all missing values using a temporary integer value

In [5]:
# mark every missing entry with the placeholder 0 (not used by any SNP code)
genes = genes.fillna(value=0)

### Remove target data from features dataframe

In [6]:
# separate the target column from the features
genes = genes.drop(labels=['response'], axis=1)

# keep the dataframe dimensions as constants: rows are samples, columns are features
TOTAL_SAMPLES, TOTAL_FEATURES = genes.shape  # (241, 119)

### Encode categorical variables into numerical variables

In [7]:
# integer code assigned to each SNP genotype string (categorical variables)
cleanup_nums = {
    'CC': 2,
    'TT': 3,
    'CG': 4,
    'AA': 5,
    'AC': 6,
    'GT': 7,
    'CT': 8,
    'AG': 9,
    'GG': 10,
    'AT': 11,
    'TC': 12,
}

# the integer codes alone, in the dictionary's iteration order
nums_values = [code for code in cleanup_nums.values()]

# swap every genotype string for its integer code from the mapping dictionary
genes = genes.replace(to_replace=cleanup_nums)

### Impute missing data using the most-frequent-value strategy

In [8]:
# treat the placeholder 0 as "missing" and fill it, column by column (axis=0),
# with the most frequent SNP code
# NOTE(review): `Imputer` is presumably only available in older scikit-learn
# releases; confirm the installed version still provides it
imputer = Imputer(
    axis=0,
    strategy='most_frequent',
    missing_values=0,
)
imputed_values = imputer.fit_transform(genes)
filled_genes = pd.DataFrame(imputed_values)

# preview the first rows of the imputed dataframe
filled_genes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,109,110,111,112,113,114,115,116,117,118
0,2.0,5.0,9.0,9.0,8.0,3.0,10.0,2.0,3.0,3.0,...,5.0,7.0,6.0,5.0,9.0,5.0,3.0,2.0,5.0,10.0
1,2.0,9.0,9.0,9.0,2.0,8.0,10.0,8.0,2.0,3.0,...,5.0,7.0,2.0,9.0,9.0,9.0,8.0,8.0,9.0,10.0
2,2.0,10.0,9.0,9.0,2.0,3.0,10.0,2.0,3.0,3.0,...,5.0,3.0,6.0,5.0,9.0,5.0,3.0,2.0,5.0,10.0
3,2.0,9.0,9.0,9.0,2.0,3.0,10.0,2.0,3.0,3.0,...,5.0,3.0,2.0,5.0,9.0,5.0,8.0,2.0,5.0,10.0
4,2.0,10.0,10.0,5.0,8.0,3.0,10.0,2.0,3.0,3.0,...,5.0,3.0,6.0,5.0,9.0,5.0,8.0,2.0,9.0,10.0


### Check missing values in the features dataframe after imputation

In [9]:
# count the missing entries left in every column after imputation
remaining_missing = filled_genes.isnull().sum()
print(remaining_missing)

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
89     0
90     0
91     0
92     0
93     0
94     0
95     0
96     0
97     0
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    0
108    0
109    0
110    0
111    0
112    0
113    0
114    0
115    0
116    0
117    0
118    0
Length: 119, dtype: int64


### Encode numerical variables into dummy variables using a manual one-hot encoding method

In [10]:
# One-hot encode every SNP value of every sample.
#
# Each feature becomes a group of len(nums_values) columns; position k inside a
# group stands for the code nums_values[k]. The groups of one sample are laid
# side by side, so a row has TOTAL_FEATURES * len(nums_values) entries.
# The whole table is encoded with array operations instead of one DataFrame
# lookup per (sample, feature) pair.

# codes in the same order as nums_values
snp_codes = np.asarray(nums_values)

# pull the imputed values out of the dataframe once
genotypes = filled_genes.values
if genotypes.shape != (TOTAL_SAMPLES, TOTAL_FEATURES):
    raise ValueError(
        'imputed data has shape {}, expected {}'.format(
            genotypes.shape, (TOTAL_SAMPLES, TOTAL_FEATURES)))

# compare every value against every code at once -> (samples, features, codes)
code_matches = genotypes[:, :, np.newaxis] == snp_codes

# a value that matches no code (e.g. an unmapped genotype or NaN) cannot be encoded
if not code_matches.any(axis=2).all():
    raise ValueError('found SNP values that are not present in nums_values')

# position of the first matching code for each value (same as list.index)
code_positions = code_matches.argmax(axis=2)

# row k of the identity matrix is the one-hot vector for position k
onehot_groups = np.eye(len(snp_codes), dtype=int)[code_positions]

# set a list of rows (one per sample) holding the final one hot encoded dataset
onehot_encoded = onehot_groups.reshape(
    TOTAL_SAMPLES, TOTAL_FEATURES * len(snp_codes)).tolist()

# check final one hot encoded dataset
print(onehot_encoded)

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
# wrap the encoded rows in a dataframe (one row per sample)
df = pd.DataFrame(data=onehot_encoded)

# report its (rows, columns)
encoded_shape = df.shape
print(encoded_shape)

# preview the first rows of the encoded dataframe
df.head()

(241, 1309)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1299,1300,1301,1302,1303,1304,1305,1306,1307,1308
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Export preprocessed data using Pickle

In [12]:
# export preprocessed data as pickle objects
# (each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, instead of being left open)
with open("../data/preprocessed/article-genetic-data-features.p", "wb") as features_file:
    pickle.dump(df, features_file)

with open("../data/preprocessed/article-genetic-data-labels.p", "wb") as labels_file:
    pickle.dump(response, labels_file)

### Export preprocessed data as CSV file using Pandas

In [13]:
# place the target column first, followed by the one-hot encoded features
dataset_parts = [response, df]
preprocessed_dataset = pd.concat(dataset_parts, axis=1)

# write the complete dataset to CSV without the row index
csv_path = '../data/preprocessed/preprocessed-article-genetic-data.csv'
preprocessed_dataset.to_csv(csv_path, index=False)