In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [112]:
# Load the training and test variant tables; the first CSV column becomes
# the row index in both frames.
variants, test_variants = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("training_variants", "test_variants")
)
variants.head()

Unnamed: 0_level_0,Gene,Variation,Class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,FAM58A,Truncating Mutations,1
1,CBL,W802*,2
2,CBL,Q249E,2
3,CBL,N454D,3
4,CBL,L399V,4


In [45]:
# Print the index range, column dtypes and non-null counts of the training variants.
variants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3321 entries, 0 to 3320
Data columns (total 3 columns):
Gene         3321 non-null object
Variation    3321 non-null object
Class        3321 non-null int64
dtypes: int64(1), object(2)
memory usage: 103.8+ KB


In [114]:
# Same summary for the test variants (index range, dtypes, non-null counts).
test_variants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5668 entries, 0 to 5667
Data columns (total 2 columns):
Gene         5668 non-null object
Variation    5668 non-null object
dtypes: object(2)
memory usage: 132.8+ KB


In [116]:
# Number of distinct values in each column of the training variants.
variants.apply(pd.Series.nunique)

Gene          264
Variation    2996
Class           9
dtype: int64

In [118]:
# Number of distinct values per column in the test variants; the test set
# contains far more distinct genes than the training set.
test_variants.apply(pd.Series.nunique)

Gene         1397
Variation    5628
dtype: int64

In [95]:
# Ten most frequent variation labels in the training set.
variants["Variation"].value_counts()[:10]

Truncating Mutations    93
Deletion                74
Amplification           71
Fusions                 34
Overexpression           6
G12V                     4
T58I                     3
Q61R                     3
Q61L                     3
Q61H                     3
Name: Variation, dtype: int64

In [96]:
# Rows whose variation is G12V (the same variation occurs in several genes).
# NOTE: `df` (variants plus the `text` column) is only built in a later cell
# of this notebook; on a fresh kernel that cell must run before this one.
df.loc[df["Variation"] == "G12V"]

Unnamed: 0_level_0,Gene,Variation,Class,text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1278,HRAS,G12V,7,The three-dimensional structure of the complex...
3103,RAC1,G12V,7,Members of the RAS superfamily of small guanos...
3150,KRAS,G12V,7,The small G-protein NRAS is mutated in 22% of ...
3193,NRAS,G12V,7,To study the oncogenic role of the NRAS oncoge...


In [99]:
# Rows whose variation is "Overexpression"; the same label maps to different classes.
# NOTE: `df` (variants plus the `text` column) is only built in a later cell
# of this notebook; on a fresh kernel that cell must run before this one.
df.loc[df["Variation"] == "Overexpression"]

Unnamed: 0_level_0,Gene,Variation,Class,text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
87,CCNE1,Overexpression,7,A variety of results point to the transcriptio...
612,AURKB,Overexpression,2,Cancers arise owing to mutations in a subset o...
960,KDM5C,Overexpression,2,"Currently, few prognostic factors are availabl..."
1087,EZH2,Overexpression,7,EZH2 is a member of the polycomb group of gene...
1133,MET,Overexpression,7,BACKGROUND The c-met protooncogene encodes th...
1820,RHOA,Overexpression,7,Abstract Adhesion of tumor cells to host cel...


In [101]:
# the class distribution is heavily imbalanced
variants["Class"].value_counts()

7    953
4    686
1    568
2    452
6    275
5    242
3     89
9     37
8     19
Name: Class, dtype: int64

In [113]:
# Load the free-text evidence for the training and test variants as Series
# named "Text".
#
# * The separator is the regex r"\|\|" (a literal "||"), which requires the
#   python parsing engine.  It is written as a raw string because "\|" is an
#   invalid escape sequence in a normal string literal and triggers a
#   DeprecationWarning / SyntaxWarning on recent Python versions.
# * `squeeze=True` was deprecated in pandas 1.4 and removed in 2.0; calling
#   `.squeeze("columns")` on the parsed frame is the documented replacement
#   and likewise returns a Series when there is a single data column.
# NOTE(review): presumably each file holds one "ID||text" record per line
# after a header line -- confirm against the data files.
texts = (
    pd.read_table("training_text", sep=r"\|\|", engine="python")
    .squeeze("columns")
    .rename("Text")
)

test_texts = (
    pd.read_table("test_text", sep=r"\|\|", engine="python")
    .squeeze("columns")
    .rename("Text")
)

texts.head()

0    Cyclin-dependent kinases (CDKs) regulate a var...
1     Abstract Background  Non-small cell lung canc...
2     Abstract Background  Non-small cell lung canc...
3    Recent evidence has demonstrated that acquired...
4    Oncogenic mutations in the monomeric Casitas B...
Name: Text, dtype: object

In [52]:
# Summary of the training texts: count, number of distinct texts, and the most repeated one.
texts.describe()

count                                                  3321
unique                                                 1921
top       The PTEN (phosphatase and tensin homolog) phos...
freq                                                     53
Name: Text, dtype: object

In [119]:
# Show the full text stored under ID 0.
texts.loc[0]

"Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kinase activity revealed. Previous work has shown that CDK10 silencing increases ETS2 (v-ets erythroblastosis virus E26 oncogene homolog 2)-driven activation of the MAPK pathway, which confers tamoxifen resistance to breast cancer cells. The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive. Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin. Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations. We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10. Cyclin M silencing phenocopies CDK1

In [54]:
# How many times the gene symbol of variant 0 appears verbatim in its text.
texts.loc[0].count("FAM58A")

34

In [75]:
# Combine the training variants with their texts (aligned on the row index)
# into a new frame, leaving `variants` itself untouched.
df = variants.assign(text=texts)
df.head()

Unnamed: 0_level_0,Gene,Variation,Class,text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [83]:
# Count how often each row's gene symbol appears verbatim in its own text.
# Columns are accessed by label rather than by position: integer keys on a
# string-labelled row are a deprecated positional fallback in recent pandas
# and would silently pick the wrong fields if the column order changed.
# `str.count` is an exact, case-sensitive substring match, so a count of 0
# only means the literal symbol is absent from the text.
gene_counts = df.apply(lambda row: row["text"].count(row["Gene"]), axis=1)
gene_counts.sort_values().head()

ID
511     0
1299    0
647     0
648     0
527     0
dtype: int64

TP53 is tumor protein 53, which is usually referred to as p53, so a text can discuss the gene without ever containing the literal symbol "TP53".

In [81]:
# Inspect row 511, one of the rows whose gene symbol never appears in its text.
df.loc[511]

Gene                                                      TP53
Variation                                                G245A
Class                                                        1
text         Tumor-derived mutant forms of p53 compromise i...
Name: 511, dtype: object

In [88]:
# First rows whose gene symbol does not occur in the associated text.
df.loc[gene_counts.eq(0)].head()

Unnamed: 0_level_0,Gene,Variation,Class,text
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39,DICER1,E1705A,4,Human Dicer contains two RNase III domains (RN...
40,DICER1,D1810A,4,Human Dicer contains two RNase III domains (RN...
71,CCND2,Amplification,2,"The cyclin-dependent protein kinases, or cdks,..."
75,CCND3,Amplification,2,"The cyclin-dependent protein kinases, or cdks,..."
78,CCND3,T286A,7,The activities of cyclin D-dependent kinases s...


In [103]:
# Distinct genes that have at least one row where the symbol is missing from the text.
df.loc[gene_counts.eq(0), "Gene"].unique()

array(['DICER1', 'CCND2', 'CCND3', 'TGFBR1', 'TGFBR2', 'EGFR', 'NKX2-1',
       'BRD4', 'ELF3', 'EP300', 'EPAS1', 'TP53', 'TP53BP1', 'SMAD2',
       'SMAD3', 'SMAD4', 'FBXW7', 'CDKN2A', 'ERBB2', 'ERBB3', 'ERBB4',
       'BRIP1', 'ERCC4', 'ABL1', 'PDGFRB', 'ESR1', 'EWSR1', 'MDM4',
       'FANCA', 'KMT2C', 'PIK3CA', 'PIK3CB', 'PIM1', 'PIK3R1', 'ARID5B',
       'HRAS', 'KMT2A', 'AKT1', 'FGFR3', 'RAD54L', 'PMS1', 'VHL', 'RRAS2',
       'FLT1', 'FLT3', 'RHOA', 'NCOR1', 'PPP2R1A', 'MTOR', 'CARD11',
       'CTNNB1', 'MAP2K1', 'MAP2K2', 'MYC', 'SOX9', 'AGO2', 'CCND1',
       'KEAP1', 'PTCH1', 'JAK1', 'AURKA', 'STK11', 'PTPN11', 'BRAF',
       'NFE2L2', 'GLI1', 'NFKBIA', 'GNAS', 'KIT', 'NOTCH1', 'DNMT3B',
       'KRAS', 'RAF1', 'RASA1', 'NTRK1', 'NTRK2', 'CASP8', 'RET', 'RUNX1'], dtype=object)

In [111]:
# Per-gene frequency of each class label (first few entries only).
variants["Class"].groupby(variants["Gene"]).value_counts().head()

Gene   Class
ABL1   2        24
       7         2
ACVR1  7         2
       2         1
AGO2   1         4
Name: Class, dtype: int64