In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the 'RFClassComp' sheet of the comparison workbook into `mydf`.
myxls = '2010_ExpressionAnalysisCompare.xlsx'
# Open the workbook in a context manager so its file handle is released as
# soon as the sheet has been parsed (a bare pd.ExcelFile stays open).
with pd.ExcelFile(myxls) as x1:
    mydf = x1.parse('RFClassComp')

mydf

In [None]:
# Feature count vs. summed top-3 feature importance.
# The last `mytst` rows of `mydf` are drawn separately (red markers); the
# linear fit and the correlation use only the rows before them.
Ori_Label = ['a','b','c','d','e','f','g']
Tst_Label = ['h','i','k']
mytst = 3
XName = '# Features'
YName = 'Sum Top 3 FI'

# Cast to float, as the other plotting cells do, so that np.corrcoef and
# np.polyfit always receive numeric arrays.
Feat_Num = mydf[XName][:-mytst].astype(float).values
Feat_Imp = mydf[YName][:-mytst].astype(float).values
TstF_Num = mydf[XName][-mytst:].astype(float).values
TstF_Imp = mydf[YName][-mytst:].astype(float).values

# PCorr is the Pearson correlation coefficient r; m and n are the slope and
# intercept of the least-squares line.
PCorr = np.corrcoef(Feat_Num, Feat_Imp)[0,1]
m,n = np.polyfit(Feat_Num, Feat_Imp, 1)

# Select the style before anything is drawn so that it applies to this figure
# too. Newer Matplotlib releases ship the style as 'seaborn-v0_8-paper' and no
# longer accept the old 'seaborn-paper' name, so pick whichever is available.
_paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use(_paper_style)
# Font-size overrides go after the style so they take precedence over it.
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=14)

# Regression line across the range of the fitted data.
X = np.linspace(np.min(Feat_Num),np.max(Feat_Num),100)
plt.plot(X, X*m+n, 'k:')

# Fitted points, each annotated with its label; the shifts are per-label text
# offsets in data coordinates.
plt.scatter(Feat_Num, Feat_Imp, s=100, marker='X')
xshift = [5,5,5,5,-15,-15,-15]
yshift = [0,0,.01,0,0,-.01,0]
for x, y, dx, dy, Source in zip(Feat_Num, Feat_Imp, xshift, yshift, Ori_Label):
    plt.text(x+dx, y+dy, Source)

# Points from the trailing rows, excluded from the fit.
plt.scatter(TstF_Num, TstF_Imp, s=100, c='r', marker='x')
tstxshift = [8,8,-15]
tstyshift = [0,0,0]
for x, y, dx, dy, Source in zip(TstF_Num, TstF_Imp, tstxshift, tstyshift, Tst_Label):
    plt.text(x+dx, y+dy, Source)

plt.xlabel('Feature Number')
plt.ylabel('Sum of Top 3 Feature Importance')
# The legend is labelled R^2, so report the squared correlation (for a
# straight-line fit with intercept, R^2 equals r**2) rather than r itself.
# '{:+.1f}' prints the intercept with its own sign, avoiding 'x+-0.1'.
R2Legend = 'y={:.3f}x{:+.1f}, R$^2$={:.2f}'.format(m,n,PCorr**2)
plt.legend([R2Legend])
plt.savefig('Feature_Number-vs-Importance.png', bbox_inches='tight')

In [None]:
# Coefficient of variation of the cross-validated f1 score vs. number of
# training samples. The last `mytst` rows of `mydf` are drawn in red.

# Select the style before drawing so this cell does not depend on rcParams
# left behind by a previously run cell. Newer Matplotlib releases ship the
# style as 'seaborn-v0_8-paper' and no longer accept 'seaborn-paper', so pick
# whichever name is available.
_paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use(_paper_style)
# Font-size overrides go after the style so they take precedence over it.
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=14)

XName = '# Train'
Y1Name = 'f1 CrossV Mean'
Y2Name = 'f1 CrossV Std'
Y3Name = 'coefficient Var.'

# Rows before the trailing `mytst` ones. f1Mean and f1Std are not plotted in
# this cell; the error-bar cell further down reads them.
TrainSet = mydf[XName][:-mytst].astype(float).values
f1Mean = mydf[Y1Name][:-mytst].astype(float).values
f1Std = mydf[Y2Name][:-mytst].astype(float).values
f1CV = mydf[Y3Name][:-mytst].astype(float).values

# The trailing `mytst` rows.
Tst_TrainSet = mydf[XName][-mytst:].astype(float).values
Tst_f1Mean = mydf[Y1Name][-mytst:].astype(float).values
Tst_f1Std = mydf[Y2Name][-mytst:].astype(float).values
Tst_f1CV = mydf[Y3Name][-mytst:].astype(float).values

# Scatter plus per-point labels; the shifts are text offsets in data coordinates.
plt.scatter(TrainSet, f1CV, s=100, marker='X')
xshift = [-110,50,75,-110,50,50,50]
yshift = [-0.01,0,0,0,-.0075,0,-.025]
for x, y, dx, dy, Source in zip(TrainSet, f1CV, xshift, yshift, Ori_Label):
    plt.text(x+dx, y+dy, Source)

plt.scatter(Tst_TrainSet, Tst_f1CV, c='r', s=100, marker='x')
tstxshift = [-125,75,50]
tstyshift = [-0.01,-.01,0]
for x, y, dx, dy, Source in zip(Tst_TrainSet, Tst_f1CV, tstxshift, tstyshift, Tst_Label):
    plt.text(x+dx, y+dy, Source)

plt.xlabel('Training Samples')
plt.ylabel('f1 coefficient of variation')
# Legend entries are matched to the two scatter series in drawing order.
R2legend_orig = 'Published libraries'
R2legend_test = 'Library cuts'
plt.legend([R2legend_orig, R2legend_test])
plt.savefig('SampleTrain-vs-f1CV.png', bbox_inches='tight')

plt.show()


In [None]:
# Mean f1 per library, with the 'f1 CrossV Std' values drawn as thick error bars.
R2legend_orig = 'Published libraries'
R2legend_test = 'Library cuts'

# Font sizes for axis labels, tick labels and the legend.
for _group, _size_kw in (('axes', {'labelsize': 14}),
                         ('xtick', {'labelsize': 12}),
                         ('ytick', {'labelsize': 12}),
                         ('legend', {'fontsize': 14})):
    plt.rc(_group, **_size_kw)

# Settings shared by both series: dot markers, wide cap-less error bars.
_bar_style = dict(fmt='o', elinewidth=5, capsize=0)
plt.errorbar(Ori_Label, f1Mean, yerr=f1Std, color='b', ecolor='lightblue', **_bar_style)
plt.errorbar(Tst_Label, Tst_f1Mean, yerr=Tst_f1Std, color='r', ecolor='pink', **_bar_style)

plt.xlabel('Promoter library')
plt.ylabel('f1 avg and std with cross val.')
# Legend entries follow the order in which the two series were drawn.
plt.legend([R2legend_orig, R2legend_test])

plt.savefig('Libr-vs-f1AvgStd.png', bbox_inches='tight')
plt.show()

In [None]:
# Coefficient of variation of the cross-validated f1 score vs. the
# 'Avg Seq Dist' column. f1CV and Tst_f1CV are taken from the
# training-samples cell, which must have been run first.
XName = 'Avg Seq Dist'
AvSqDist = mydf[XName][:-mytst].astype(float).values
Tst_AvSqDist = mydf[XName][-mytst:].astype(float).values

# Select the style before drawing so this cell does not depend on rcParams
# left behind by a previously run cell. Newer Matplotlib releases ship the
# style as 'seaborn-v0_8-paper' and no longer accept 'seaborn-paper', so pick
# whichever name is available.
_paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use(_paper_style)
# Font-size overrides go after the style so they take precedence over it.
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=14)

# Scatter plus per-point labels; the shifts are text offsets in data coordinates.
plt.scatter(AvSqDist, f1CV, s=100, marker='X')
xshift = [.01,.01,.01,.01,.01,.01,.01]
yshift = [0,0,0,0,0,0,0]
for x, y, dx, dy, Source in zip(AvSqDist, f1CV, xshift, yshift, Ori_Label):
    plt.text(x+dx, y+dy, Source)

plt.scatter(Tst_AvSqDist, Tst_f1CV, c='r', s=100, marker='x')
tstxshift = [.01,.01,.01]
tstyshift = [0,0,0]
for x, y, dx, dy, Source in zip(Tst_AvSqDist, Tst_f1CV, tstxshift, tstyshift, Tst_Label):
    plt.text(x+dx, y+dy, Source)

plt.xlabel('Sequence diversity')
plt.ylabel('f1 coefficient of variation')
# Legend entries are matched to the two scatter series in drawing order.
R2legend_orig = 'Published libraries'
R2legend_test = 'Library cuts'
plt.legend([R2legend_orig, R2legend_test])
plt.savefig('SeqDiv-vs-f1CV.png', bbox_inches='tight')

plt.show()
