In [1]:
%pylab
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_decomposition import PLSRegression
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore
!{sys.executable} -m pip install mplcursors
import mplcursors



DeSeqOutputAllConds = 'DeSeqOutputAllConds.tsv'
si2_si4_RNA_seq = 'si2-si4_RNA-seq-pipeline-output-normalized.tsv'


def getTSVFile(filepath):
    data = pd.read_csv (filepath, sep = '\t')
    return data

  
def zScoreData(df, columnsToZScore):
    for column in columnsToZScore:
        df[column] = zscore(df[column])
        
    return df

def fixBadChromosomeLabeling(df, column='chrom'): 
    replace = list(range(1, 23)) + ['X', 'Y']
    
    for i in replace:
        df[column].replace(str(i), 'chr' + str(i), inplace=True)
        
    return df


Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
df = getTSVFile(si2_si4_RNA_seq)
df = df.loc[df['counts'] > 0]#Filter out rows where count is zero
df = fixBadChromosomeLabeling(df)

#Set up copies to play around with lader
df_original = df.copy()

#Remove unnecessary columns
#['experiment', 'sampleID', 'counts', 'tpm', 'gene_name', 'chrom'] Leon..KEEP
df = df[['sampleID', 'tpm', 'gene_name']]

#display(df.sampleID.unique())

df = zScoreData(df, ['tpm'])


In [3]:
df_test = df_original.copy()
df_test = df_test.loc[df_test['counts'] > 0]


print(len(df_test['gene_name'].unique()))

#Group by gene and therapeutic level, sorting by average tpm value w/ highest tpm first, 
#getting top 20

#print(len(df_test[['gene_name'== 'MT-CO2', 'sampleID' =='35-TGFb-and-RA-high']]))
df_test.query('gene_name=="MT-CO2" & sampleID=="35-TGFb-and-RA-high"')



#Homework: Get top 10 results, for gene_name, sampleID, and both and PLOT IT
df_test.groupby(['gene_name', 'sampleID' ]).mean().sort_values(by=['tpm'], ascending=False)[:20]


#1. One hot encode the genes. When they are numbers they wont disappear using 'groupby'
#2. Groupby by sampleID(treatments)
#3. Remove columns [counts, rpm, rpkm, tx_start, tx_end, TSS_loc]
#4. Transpose the dataframe so the genes are the row
#5. Sort by tpm and get top n results. So you top 20 genes
#6. Transpose it back and now you have top 20 columns




22112


Unnamed: 0_level_0,Unnamed: 1_level_0,counts,rpm,rpkm,tpm,tx_start,tx_end,TSS_loc
gene_name,sampleID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MT-CO2,35-TGFb-and-RA-high,85573.0,7775.92597,11368.312822,40095.437187,7585.0,8269.0,7585.0
MT-CO2,31-TGFb-high,82408.0,7040.157466,10292.627874,37089.321282,7585.0,8269.0,7585.0
MT-CO2,14-EtOH-highDensity,56087.0,6669.206547,9750.301969,34573.327005,7585.0,8269.0,7585.0
MT-CO2,20-EtOH-halfDensity,60635.0,5830.951732,8524.783234,30770.942552,7585.0,8269.0,7585.0
MT-CO2,36-RA-low,63051.0,5752.710893,8410.396043,30730.360023,7585.0,8269.0,7585.0
MT-CO2,12-RA-med,53715.0,5811.451269,8496.273785,30633.995982,7585.0,8269.0,7585.0
MT-CO2,33-TGFb-low,60700.0,5633.969102,8236.796933,30175.431961,7585.0,8269.0,7585.0
MT-CO2,32-RA-high,62147.0,5616.004189,8210.53244,29896.22299,7585.0,8269.0,7585.0
MT-CO2,34-EtOH-highDensity,60860.0,5471.593654,7999.405927,29754.583288,7585.0,8269.0,7585.0
MT-CO2,27-EtOH-nlDensity,63087.0,5543.266864,8104.191322,29703.706371,7585.0,8269.0,7585.0


In [4]:
# Retinoic Acid (50, 200 and 400 nM)
# TGF-β (1.25, 5, and 10 ng/mL)


sampleIDs_transform = {
  'RA-high': {'TGFb': 0, 'retinoicAcid': 400, 'EtOH': 0}, 
  'EtOH-nlDensity': {'TGFb': 0, 'retinoicAcid': 0, 'EtOH': 0},
  'RA-med': {'TGFb': 0, 'retinoicAcid': 200, 'EtOH': 0}, 
  'EtOH-halfDensity': {'TGFb': 0, 'retinoicAcid': 0, 'EtOH': 0}, 
  'RA-low': {'TGFb': 0, 'retinoicAcid': 50, 'EtOH': 0},
  'EtOH-highDensity': {'TGFb': 0, 'retinoicAcid': 0, 'EtOH': 0}, 
  'TGFb-and-RA-low': {'TGFb': 1.25, 'retinoicAcid': 50, 'EtOH': 0}, 
  'TGFb-and-RA-high': {'TGFb': 10, 'retinoicAcid': 400, 'EtOH': 0},
  'TGFb-and-RA-med': {'TGFb': 5, 'retinoicAcid': 200, 'EtOH': 0}, 
  'TGFb-high': {'TGFb': 10, 'retinoicAcid': 0, 'EtOH': 0}, 
  'TGFb-low': {'TGFb': 1.25, 'retinoicAcid': 0, 'EtOH': 0}, 
  'TGFb-med': {'TGFb': 5, 'retinoicAcid': 0, 'EtOH': 0}
}

In [5]:
#https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
def combineTreatmentColumns(df, exceptions):
    df.rename(columns=lambda x: x[3:] if x not in exceptions else x, inplace=True)
    return df

def changeTreatmentNameInRows(df, columnName='sampleID', exceptions={}):
    df[columnName] = df[columnName].apply(lambda x: x[3:] if x not in exceptions else x)
    return df


In [6]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

pca = PCA(n_components=3)

#if you increase the number of PCs you get a lot more variance capture


def printPCA_2D(new_df, title):
    pca.fit(new_df)

    print('''
    Variance for PCs = %s 
    Sum of variances = %.03f  
    pca.shape = %s where rows=loadings=%d, columns=PCs=%d''' 
          % (pca.explained_variance_ratio_, np.sum(pca.explained_variance_ratio_), 
             pca.components_.shape, pca.components_.shape[1], pca.components_.shape[0]))
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(title)
    
    print(pca.components_)

    for i in range(pca.components_.shape[1]):
        print(pca.components_[0,i])
        print(pca.components_[1,i])
        plt.scatter(pca.components_[0, i], pca.components_[1, i])

    mplcursors.cursor(hover=True)

    plt.legend(bbox_to_anchor=(1.35, 1)) 
    plt.show()

    
def printPCA_3D(new_df, title):
    pca.fit(new_df)

    print('''
    Variance for PCs = %s 
    Sum of variances = %.03f  
    pca.shape = %s where rows=loadings=%d, columns=PCs=%d''' 
          % (pca.explained_variance_ratio_, np.sum(pca.explained_variance_ratio_), 
             pca.components_.shape, pca.components_.shape[1], pca.components_.shape[0]))
    
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title(title)

    for i in range(pca.components_.shape[1]):
        ax.scatter(xs=pca.components_[0, i], ys=pca.components_[1, i], 
                       zs=pca.components_[2, i], label=new_df.columns[i])


    plt.legend(bbox_to_anchor=(1.6, 1)) 
    plt.show()


def getPCA_DFs(groupby=['sampleID', 'gene_name'], keepColumns=['sampleID', 'tpm', 'gene_name'], topN=5000):
    df_pca = df_original.copy()
    df_pca = df_pca[keepColumns]

    #Group data by two columns, which will later be a regular row and column
    gb = df_pca.groupby(groupby).mean().sort_values(by=['tpm'], ascending=False)[:topN]
    
    
    display(gb)
    #Extract rows and columns of top N results by indices
    rows = set([groupby[0] for groupby in gb.index])
    columns = set([groupby[1] for groupby in gb.index])

    #Create a new dataframe
    new_df = pd.DataFrame(0, index=rows, columns=columns)
    new_df_log = new_df.copy()

    #Iterate
    for (row, col) in gb.index:
        tpv_val = gb.loc[row, col].tpm
        new_df.loc[row, col] = tpv_val
        new_df_log.loc[row, col] = np.log(tpv_val)
        
    return new_df, new_df_log

#From Meyer treatments as rows, genes as columns


groupby=['sampleID', 'gene_name'] #Should only be two things! groupby[0] = row, groupby[1] = column
new_df, new_df_log = getPCA_DFs(groupby=groupby, topN=27311)

#display(new_df)

printPCA_2D(new_df_log, title='Loadings Plot for Log-Normal PCA' )
#printPCA_2D(new_df, title='Loadings Plot for Normal PCA' )



Unnamed: 0_level_0,Unnamed: 1_level_0,tpm
sampleID,gene_name,Unnamed: 2_level_1
35-TGFb-and-RA-high,MT-CO2,40095.437187
31-TGFb-high,MT-CO2,37089.321282
14-EtOH-highDensity,MT-CO2,34573.327005
20-EtOH-halfDensity,MT-CO2,30770.942552
36-RA-low,MT-CO2,30730.360023
...,...,...
32-RA-high,CDC42,195.066168
06-RA-high,EIF3A,195.058901
31-TGFb-high,MEA1,195.054882
51-RA-med,POLR2G,195.054585



    Variance for PCs = [0.30219037 0.17461947 0.08014898] 
    Sum of variances = 0.557  
    pca.shape = (3, 1011) where rows=loadings=1011, columns=PCs=3
[[-1.00284983e-03  5.62560690e-03 -1.94685402e-03 ... -2.02973123e-04
   2.99764120e-04  6.20797343e-02]
 [ 2.58075292e-02  1.49986715e-02  2.90956628e-03 ... -3.04281377e-05
   8.06318662e-03 -7.33341325e-02]
 [-1.01417489e-02 -3.38072432e-03 -7.51693242e-04 ...  4.57851280e-04
  -9.28825630e-03  8.40728985e-03]]
-0.001002849828900175
0.025807529183093317
0.005625606898350075
0.014998671518138199
-0.0019468540176196356
0.0029095662757322497
-0.022051967874358296
-0.00983592396020881
-0.0005282479296584071
-0.000395811431276633
0.008309929320177647
0.005756098450904481
0.07155825079282704
0.07540336651833589
-0.0010089217052895929
-0.00025258497482577195
0.00034407226960478545
0.0031829180079434045
0.07438437486948062
0.006028521979138299
0.009541095093064082
0.0008191812496134413
0.0034297362893525294
0.0005320633349252313
-0.0686

-0.07247115551579147
0.011305572923605888
-0.0007718508567179438
0.002634439185859286
-0.054421346120202725
0.003912628067033478
0.013540614546163302
0.002882038436247192
-0.0020557494025136387
0.001350070927192375
0.015409920164577937
0.07381537175680976
-0.0011042181829417057
-0.0022198185123869308
0.005797162170305296
0.007058855526382574
0.003829616587577846
0.013916175008213055
0.0039271155929794304
-0.0014850353545595638
0.06745442811126819
0.003102445183311956
0.019623657091217567
0.002125635599617117
-0.003916995750995222
-0.0032469951804998463
-0.014059516852032923
0.047119374546213295
-0.04818899077701049
-0.014766392700022852
-0.0519984313648343
0.01100278088968681
0.018123795590505826
0.016838446616122573
-0.003518144131046097
0.0037035911063724126
-0.0028527761692022082
0.0044231146201300425
-0.01404767814719295
-0.025608701110247625
-0.003372289013487683
-0.0022945814841792323
0.002213917536143728
-0.0018831979301886898
0.0018622263094161668
-0.001319841977311288
0.083807

0.0019223672177045741
0.00032766603521568654
-0.06403953957403305
-0.05264415941999728
-0.05168232373053309
0.012625191145278937
0.0026679541947669315
-0.0019257225293928227
0.0008746157232022496
-0.0006499692276804577
-0.0013488256215622768
0.0025265259270328146
-0.0011353494021663021
-5.740802562244122e-05
-0.0074490343068722275
0.01681152536390138
0.03195318928950953
0.05491452111399715
-0.004323478204581187
-7.265196080816711e-05
-0.0349893273758697
-0.016605731923242457
-0.0017872922633494774
0.0075576872614965965
-0.010319198655397136
0.017294653520231434
0.002604102243424902
-0.002055272713496478
-0.008885628300096727
0.0012214442361429987
0.024620298564623277
0.029434224166512736
0.0012456611042052626
0.013632063438121685
0.03161733998022952
0.025022110180034224
-0.0009278752144462055
-0.0009739381158695668
0.0031554104313971177
0.02109811522143603
0.000974726474081148
0.0017880724735495229
0.03986560415097024
0.014913293574750533
0.048291952922284026
-0.10629402938242692
0.001

-0.019733987598913767
0.03677436719132029
0.046790371947371616
-0.027369223220675507
0.07459916946744169
-0.09759785521134134
-0.021163803404918267
0.004356639714744226
0.0066267994794725685
0.005411247427241951
-0.0730410634617968
-0.08140519595836253
0.0006193205716597261
0.002143380820447157
0.01119085649265317
0.03607712422015535
0.01980583642718963
0.0021453681804972672
0.00047553753481566506
0.00047056303445045666
-0.03860076563135345
0.03202091008540632
0.012920043360991451
0.0022538458761344076
-0.0004724154718379486
-0.0007118321793002589
0.02396245948101359
0.040169562919117725
-0.014963077463658513
0.03539569698017744
-0.04174510222944736
0.07086364210241747
0.0004469285031771254
0.0018435992653801467
-0.01921156528187368
0.07245021874403552
0.0506979323630668
-0.005312145522743909
0.07033996838738635
-0.07647564497691361
-0.012559945048214387
-0.04035037164205811
-0.06557798870646636
0.043458362648964355
0.0019052722465116897
-0.002638026032863787
-0.005302747834062908
0.00

-0.001218135678031573
0.004199663287869289
-0.0039386213729841685
-0.004737563828078745
0.0007888509944171355
-0.0003812990196420708
0.0127143505394485
-0.01334312670139606
-0.06052003752350142
0.01438101797382026
0.013133541829826565
-0.02362695753176444
0.0004748738200517367
0.0001647329673564221
0.0004125371618062523
0.005400929368057066
-0.0016278486911022138
0.001206621663458687
0.00294670670275414
-0.009103108815324713
-0.07758258567032512
0.02579917972935107
0.0006975893957956823
-0.00017602289609602842
-7.492104114039428e-05
0.0004039964485104173
0.007657245381500719
0.002570224374545652
-0.005703648576165364
-0.008499338941584068
-0.004082427047368641
0.0011259654061524789
-0.0005076801832008897
-0.0025455706088985805
0.0011567614290112088
0.00012290766386332047
-0.014807541983211311
0.029951658597034924
0.002271114010843022
0.0031540628635126884
0.07098859130100565
0.034340446341511614
-0.032074875591489076
0.0014718382531003565
-0.005448850304787233
4.388310652547925e-05
0.0

No handles with labels found to put in legend.


-0.014755727545702061
0.0017441582455694942
0.0026376991307540216
-0.0020735134201513294
0.0025511389610635383
0.014455394888523621
0.00017168954944149208
0.0024233053041635313
0.0026595750811268274
0.0017632461815492106
-0.00020297312251187265
-3.042813768503917e-05
0.0002997641200574093
0.008063186616309335
0.062079734286626956
-0.0733341324528786


In [7]:
#katy trying pca with pre-made dataset

#From Meyer: treatments as rows, genes as columns

#if already loaded you can keep this stuff commented
df_tpm = pd.read_csv('tpmdata.csv', delimiter = ',')
df_tpm = df_tpm.set_index('Unnamed: 0')

df_tpm, df_tpm_log = getPCA_DFs(groupby=groupby, topN=27311)

x = pca.fit_transform(df_tpm_log)
print(pca.components_)
print(pca.components_.shape)

print('''
    Variance for PCs = %s 
    Sum of variances = %.03f  
    pca.shape = %s where rows=loadings=%d, columns=PCs=%d''' 
      % (pca.explained_variance_ratio_, np.sum(pca.explained_variance_ratio_), 
         pca.components_.shape, pca.components_.shape[1], pca.components_.shape[0]))
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA log-transformed')

plt.scatter(pca.components_[0, :], pca.components_[1, :], label = 'PCA log transformed')
    
mplcursors.cursor(hover=True)
plt.legend(bbox_to_anchor=(1.35, 1)) 
plt.legend()
plt.show()



Unnamed: 0_level_0,Unnamed: 1_level_0,tpm
sampleID,gene_name,Unnamed: 2_level_1
35-TGFb-and-RA-high,MT-CO2,40095.437187
31-TGFb-high,MT-CO2,37089.321282
14-EtOH-highDensity,MT-CO2,34573.327005
20-EtOH-halfDensity,MT-CO2,30770.942552
36-RA-low,MT-CO2,30730.360023
...,...,...
32-RA-high,CDC42,195.066168
06-RA-high,EIF3A,195.058901
31-TGFb-high,MEA1,195.054882
51-RA-med,POLR2G,195.054585


[[-1.00284981e-03  5.62560688e-03 -1.94685402e-03 ... -2.02973123e-04
   2.99764120e-04  6.20797343e-02]
 [ 2.58075294e-02  1.49986708e-02  2.90956629e-03 ... -3.04281553e-05
   8.06318663e-03 -7.33341324e-02]
 [-1.01418122e-02 -3.38076763e-03 -7.51694712e-04 ...  4.57848124e-04
  -9.28825493e-03  8.40729697e-03]]
(3, 1011)

    Variance for PCs = [0.30219037 0.17461947 0.08014898] 
    Sum of variances = 0.557  
    pca.shape = (3, 1011) where rows=loadings=1011, columns=PCs=3


In [8]:
print(df_tpm)

x = pca.fit(df_tpm)
print(pca.components_)
print(pca.components_.shape[1])

print('''
    Variance for PCs = %s 
    Sum of variances = %.03f  
    pca.shape = %s where rows=loadings=%d, columns=PCs=%d''' 
      % (pca.explained_variance_ratio_, np.sum(pca.explained_variance_ratio_), 
         pca.components_.shape, pca.components_.shape[1], pca.components_.shape[0]))
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA')

plt.scatter(pca.components_[0, :], pca.components_[1, :], label = 'PCA not log transformed')
    
mplcursors.cursor(hover=True)
plt.legend(bbox_to_anchor=(1.35, 1)) 
plt.show()

                          RPS26       DRAP1        CYC1      S100A14  \
08-TGFb-and-RA-low   214.631057  208.885263  336.483304   505.353671   
36-RA-low            236.776206  209.198444  375.910873  1034.059946   
31-TGFb-high         207.396995    0.000000  284.778410  1067.461766   
15-EtOH-nlDensity    215.538645  204.514074  405.834704  1520.366329   
22-TGFb-high         222.024840  242.437668  373.464508  1376.862078   
12-RA-med            213.013317    0.000000  360.264863   590.785388   
24-RA-low            249.903958  218.964441  418.241741   904.802745   
35-TGFb-and-RA-high  197.770121    0.000000  277.325152   203.035473   
09-TGFb-and-RA-med   206.245966  199.471753  313.200933   351.467675   
05-EtOH-nlDensity      0.000000    0.000000  368.104695  1451.547799   
03-RA-low            216.642528  195.050371  369.853012   944.222272   
46-EtOH-nlDensity    232.663661  240.737238  446.915381  1512.932580   
52-TGFb-and-RA-med   235.622147  257.663718  382.015068   289.76