
# Data cleaning and feature engineering

---

In [327]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import MLPclassifier


<br>

## Load data and rename columns

#### Training data:

- 29k rows x 10 cols, 
- 'Beta' is outcome: methylated or not 
- Chromosomes 1-10

#### Test: 
- 20,611 rows, no outcome labels
- Chromosomes 11-22

In [244]:
train = pd.read_csv('data/train.csv')

# give the data names that don't suck
train = train.rename(columns={"Id": "id",
                              "CHR": "chromosome", 
                              "MAPINFO": "position",
                              "UCSC_CpG_Islands_Name": "island",  
                              "UCSC_RefGene_Group":"refgene",
                              "Relation_to_UCSC_CpG_Island": "rel_to_island",
                              "Regulatory_Feature_Group": "feature",
                              "Forward_Sequence":"fwd_seq",
                              "Beta": "outcome"})

# change categorical variables dtypes
for col in ["rel_to_island", "outcome"]:
    train[col] = train[col].astype("category")
for col in ["fwd_seq", "seq", "refgene"]:
    train[col] = train[col].astype("string")
train['position'] = train['position'].astype('float64')

y = train['outcome']
df = train.drop(['id', 'chromosome', 'outcome'], 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29065 entries, 0 to 29064
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   position       29065 non-null  float64 
 1   island         23717 non-null  object  
 2   refgene        24822 non-null  string  
 3   rel_to_island  23717 non-null  category
 4   feature        16421 non-null  object  
 5   fwd_seq        29065 non-null  string  
 6   seq            29065 non-null  string  
dtypes: category(1), float64(1), object(2), string(3)
memory usage: 1.4+ MB


---

<br>

## Features of the data 

- `id`: unique identifiers 
  - not useful for training, useful for data handling though  
  
  
- `chromosome, position` (exact CpG site): 
  - not really sure how to use this given that test is from different chromosomes  
  - don't want classifier to memorize positions (values = n)  
  - could derive other features like size of island, placement of cg inside of island  
    
    
- `island`: chr + position range of island  
  - again, very specific to each site which could lead to over training  
  - some missing data (4k)  
  
- `refgene`: UCSC_RefGene_Group
  - 1089 unique values...
  - contains list of tags about functional elements:
      - TSS* {200, .}
      - 1st exon
      - Body
      - 5'UTR
  - Each can have multiple tags, even multiple of same tags....
  - Maybe split into counts for each tag: columns [TSS200, TSS1500, exon1, body, utr5, ...]

Body:14797  TSS200:10935   5'UTR:9117 1stExon:6928 TSS1500:7634   3'UTR:866    NA's:4243 
                           

- `feature`: lots of missing data, seems like 2 different factors
  - Promoter/Gene/NonGene_Associated or Unclassified  
      - not Cell_type_specific
  - Promoter/Gene/NonGene_Associated_Cell_type_specific or Unclassified_Cell_type_specific
  - NA (~12k)
  
  
- `relation_to_island`:
  - 5 levels: Island:18269, S_Shore:2107, N_Shelf: 529, N_Shore: 2378, S_Shelf: 434, NA's: 5348
  
  
- `Fwd_seq` and `seq`:
  - not sure what is the relationship between these?
  - `Fwd_seq` has the [CG] site marked and are all 124 bp long
  - `seq` is 2kbp of sequence

  
----

## Nominal data
  
  <br>

In [245]:
# refgene has lists of tags
print(df['refgene'].unique()[:10])

<StringArray>
[                                'Body;Body',
                                    'TSS200',
                               "5'UTR;5'UTR",
                                      'Body',
                           '1stExon;1stExon',
                        "5'UTR;TSS200;5'UTR",
                                        <NA>,
                       'Body;Body;Body;Body',
 "5'UTR;1stExon;1stExon;5'UTR;1stExon;5'UTR",
                           'TSS1500;TSS1500']
Length: 10, dtype: string


In [246]:
# feature has 2 categories: classes (celltype_specific or not)
# and (promoter / gene / non-gene / unclassified)
print(df['feature'].unique())

[nan 'Promoter_Associated' 'Unclassified' 'Gene_Associated'
 'Unclassified_Cell_type_specific'
 'Promoter_Associated_Cell_type_specific' 'NonGene_Associated'
 'Gene_Associated_Cell_type_specific'
 'NonGene_Associated_Cell_type_specific']


In [249]:
# relation to island is one of 5 levels or unknown
print(list(df['rel_to_island'].unique()), '\n')


['Island', nan, 'S_Shore', 'N_Shelf', 'N_Shore', 'S_Shelf'] 



---

## Making dummy variables for categories

In [250]:

## get dummies for "Relation_to_UCSC_CpG_Island": 5 levels
df = pd.get_dummies(df, columns =['rel_to_island'], prefix_sep = '', prefix = '')

## pull terms from 'UCSC_RefGene_Group' lists into columns of counts
for term in ["TSS200", "TSS15000", "Body", "5'UTR", "3'UTR", "1stExon"]:
    df[term] = df["refgene"].str.count(term)
    df[term] = df[term].fillna(0).astype('int32')

## create 2 sets of dummies from 'feature' (Regulatory_Feature_Group)
df["cell_type_specific"] = df['feature'].str.count("_Cell_type_specific").fillna(0).astype('int32')
for term in ["Gene_Associated", "NonGene_Associated", "Promoter_Associated", "Unclassified"]:
    df[term] = df['feature'].str.count(term).fillna(0).astype('int32')

## postion of CpG relative to nearby island - lots of missing values though
df['isl_start'] = df['island'].str.extract(':(\d+)').astype('float64')
df['pos_to_start'] = df['position'] - df['isl_start']
df['isl_end'] = df['island'].str.extract('-(\d+)').astype('float64')
df['pos_to_end'] = df['isl_end'] - df['position']


df = df.drop(columns = ['feature', 'isl_start', 'isl_end', 'position', 'island', 'refgene'])

In [251]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29065 entries, 0 to 29064
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   fwd_seq              29065 non-null  string 
 1   seq                  29065 non-null  string 
 2   Island               29065 non-null  uint8  
 3   N_Shelf              29065 non-null  uint8  
 4   N_Shore              29065 non-null  uint8  
 5   S_Shelf              29065 non-null  uint8  
 6   S_Shore              29065 non-null  uint8  
 7   TSS200               29065 non-null  int32  
 8   TSS15000             29065 non-null  int32  
 9   Body                 29065 non-null  int32  
 10  5'UTR                29065 non-null  int32  
 11  3'UTR                29065 non-null  int32  
 12  1stExon              29065 non-null  int32  
 13  cell_type_specific   29065 non-null  int32  
 14  Gene_Associated      29065 non-null  int32  
 15  NonGene_Associated   29065 non-null 

---

## Encoding sequences

There are shorter regions (60 bp up and downstream) and longer sequences (2kbp)

In [336]:
## split the upstream and downstream seq around CpG site
short_lr = df['fwd_seq'].str.split('\[|\]', expand = True).drop(1, axis = 1).rename(columns={0:'short_l', 2:'short_r'})

In [299]:
df = pd.concat([df, l_and_r], axis=1)

In [337]:
print(df['short_l'][1])
print(set(len(x) for x in df['short_l']))
print(set(len(x) for x in df['short_r']))

GGACCACACTGCCATGGCAACAGCGTGCCTCTGCGTCCTCCATCCGGGCCTCTCTAACTA
{60}
{60}


In [338]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29065 entries, 0 to 29064
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   fwd_seq              29065 non-null  string 
 1   seq                  29065 non-null  string 
 2   Island               29065 non-null  uint8  
 3   N_Shelf              29065 non-null  uint8  
 4   N_Shore              29065 non-null  uint8  
 5   S_Shelf              29065 non-null  uint8  
 6   S_Shore              29065 non-null  uint8  
 7   TSS200               29065 non-null  int32  
 8   TSS15000             29065 non-null  int32  
 9   Body                 29065 non-null  int32  
 10  5'UTR                29065 non-null  int32  
 11  3'UTR                29065 non-null  int32  
 12  1stExon              29065 non-null  int32  
 13  cell_type_specific   29065 non-null  int32  
 14  Gene_Associated      29065 non-null  int32  
 15  NonGene_Associated   29065 non-null 

In [311]:
df['seq'].str.split('\[|\]', expand = True)

Unnamed: 0,0
0,GCGCTTCTTTGCCCCGATGAGTTCGCCTCCCCAAACGCCTACTTCG...
1,AGTAAGAGACGGAAATAAATTCCTTCCTCCCTGAGTGTCTGGTAAA...
2,AAAGGACTGAAATGCCCAGCAGGTGCTCAAGAATTGCTACCATGGC...
3,TTCTTGAAGATAACTTTCCAGAAGTACAATTTCTTGAGTCAAAGGG...
4,AAAGGAAGCAAGACGTTAGGAAAGATTAAAGACTGCACGATTTTAA...
...,...
29060,AAGAGACTCTGCAGGCTGCCCAGGGTGGTGAGAACAATGATGATGG...
29061,TTACTCAGACGCTGCAGGTTGTGAATTTCAGTCCTGGTAAATCATG...
29062,CTCCAAGCCTGAAGAGCCTCTGCCTTCCCTTCCCTCACTCTGCGTG...
29063,TCATGGCGGCAGGGATGGAAGCTATGCATGGGTTCAGCAACATGGA...


In [325]:
for x in range(10):
    pos = df['seq'][x].find(re.sub(r'[\[|\]]', '', df['fwd_seq'][x]))
    print(pos)
    df['seq'][x][pos:pos+122]

939
939
939
939
939
939
939
939
939
939


4096

In [None]:
del(train)

## Test data


In [None]:
## Same thing for test data
test = pd.read_csv('data/test.csv')
test = test.rename(columns={"Id": "id",
                              "CHR": "chromosome", 
                              "MAPINFO": "position",
                              "UCSC_CpG_Islands_Name": "island",  
                              "UCSC_RefGene_Group":"refgene",
                              "Relation_to_UCSC_CpG_Island": "rel_to_island",
                              "Regulatory_Feature_Group": "feature",
                              "Forward_Sequence":"fwd_seq"})
# change categorical variables dtypes
test["rel_to_island"] = test["rel_to_island"].astype("category")

In [None]:
test