In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sb
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [2]:
# Load the tab-separated genotype-frequency table. The file carries no header
# row, so the columns come back with the positional labels 0, 1 and 2.
df = pd.read_csv("geno.frq.rerun.output.noquestion", sep="\t", header=None)
print(df)

           0                   1       2
0        CEU   CYP2B6*1/CYP2B6*1  32.320
1        CEU  CYP2B6*1/CYP2B6*11   1.010
2        CEU  CYP2B6*1/CYP2B6*15   1.010
3        CEU   CYP2B6*1/CYP2B6*2   5.051
4        CEU  CYP2B6*1/CYP2B6*22   1.010
..       ...                 ...     ...
447  KHV206g   CYP4F2*1/CYP4F2*1  62.140
448  KHV206g   CYP4F2*1/CYP4F2*2   9.223
449  KHV206g   CYP4F2*1/CYP4F2*3  20.870
450  KHV206g   CYP4F2*2/CYP4F2*3   2.427
451  KHV206g   CYP4F2*3/CYP4F2*3   3.398

[452 rows x 3 columns]


In [3]:
# Give the three positional columns (0, 1, 2) descriptive names.
# A label-based rename only touches those three labels, so re-running this
# cell after later cells have added columns is a harmless no-op; assigning a
# 3-item list to df.columns would instead raise a length-mismatch error.
df = df.rename(columns={0: 'Ethnic', 1: 'Alleles', 2: 'Percentage'})
print(df)

      Ethnic             Alleles  Percentage
0        CEU   CYP2B6*1/CYP2B6*1      32.320
1        CEU  CYP2B6*1/CYP2B6*11       1.010
2        CEU  CYP2B6*1/CYP2B6*15       1.010
3        CEU   CYP2B6*1/CYP2B6*2       5.051
4        CEU  CYP2B6*1/CYP2B6*22       1.010
..       ...                 ...         ...
447  KHV206g   CYP4F2*1/CYP4F2*1      62.140
448  KHV206g   CYP4F2*1/CYP4F2*2       9.223
449  KHV206g   CYP4F2*1/CYP4F2*3      20.870
450  KHV206g   CYP4F2*2/CYP4F2*3       2.427
451  KHV206g   CYP4F2*3/CYP4F2*3       3.398

[452 rows x 3 columns]


In [4]:
# Break each genotype string "GENE*a/GENE*b" into its two haplotypes
# (column 0 = left of the '/', column 1 = right of it).
split_columns = df['Alleles'].str.split('/', expand=True)

# Split every haplotype on '*' once: column 0 is the gene name, column 1 the
# star-allele designation.
first_haplotype = split_columns[0].str.split('*', expand=True)
second_haplotype = split_columns[1].str.split('*', expand=True)

# Assign each derived column under its final name. Writing 'Gene' directly
# (instead of creating 'Allele' and then overwriting every column name by
# position) keeps the cell idempotent: re-running it overwrites the same four
# columns rather than adding an extra one and failing on a length mismatch.
df['Gene'] = first_haplotype[0]              # gene taken from the first haplotype
df['Star Allele 1'] = first_haplotype[1]
df['Star Allele 2'] = second_haplotype[1]

# Half of the genotype percentage (presumably the share credited to each of
# the two alleles in the pair -- confirm against the downstream analysis).
df['Percentage per allele'] = df['Percentage'] / 2

# Show the enriched table and export it without the row index.
print(df)
df.to_excel('df.xlsx', index=False)

      Ethnic             Alleles  Percentage    Gene Star Allele 1  \
0        CEU   CYP2B6*1/CYP2B6*1      32.320  CYP2B6             1   
1        CEU  CYP2B6*1/CYP2B6*11       1.010  CYP2B6             1   
2        CEU  CYP2B6*1/CYP2B6*15       1.010  CYP2B6             1   
3        CEU   CYP2B6*1/CYP2B6*2       5.051  CYP2B6             1   
4        CEU  CYP2B6*1/CYP2B6*22       1.010  CYP2B6             1   
..       ...                 ...         ...     ...           ...   
447  KHV206g   CYP4F2*1/CYP4F2*1      62.140  CYP4F2             1   
448  KHV206g   CYP4F2*1/CYP4F2*2       9.223  CYP4F2             1   
449  KHV206g   CYP4F2*1/CYP4F2*3      20.870  CYP4F2             1   
450  KHV206g   CYP4F2*2/CYP4F2*3       2.427  CYP4F2             2   
451  KHV206g   CYP4F2*3/CYP4F2*3       3.398  CYP4F2             3   

    Star Allele 2  Percentage per allele  
0               1                16.1600  
1              11                 0.5050  
2              15             

In [8]:
# Per-gene frequency tables. They are read with header=None, so each file's
# own header line is kept as the first data row of its frame.
frq_paths = [
    'frq/cyp2b6.csv',
    'frq/cyp2c9.csv',
    'frq/cyp2c19.csv',
    'frq/cyp2d6.csv',
    'frq/cyp3a5.csv',
    'frq/cyp4f2.csv',
]
frq_frames = [pd.read_csv(path, header=None) for path in frq_paths]
cyp2b6, cyp2c9, cyp2c19, cyp2d6, cyp3a5, cyp4f2 = frq_frames

# Stack the tables in the order listed above, export the result, then show it.
final_table = pd.concat(frq_frames)
final_table.to_excel('result.xlsx')
print(final_table)

     0        1     2     3     4     5     6     7     8
0  NaN   CYP2B6   KHV   CEU   CHB   CHS   JPT   SAS   YRI
1  0.0        1  66.5  55.0  80.6  80.9  73.1  47.0  38.4
2  1.0        2   5.8   4.0   2.4   3.3   2.9   4.1   3.7
3  2.0        6  25.5  26.8  14.6  15.2  18.7  37.4  39.3
0  NaN   CYP2C9   KHV   CEU   CHB   CHS   JPT   SAS   YRI
1  0.0        1  94.4  76.3  94.7  93.8  97.6  82.6  77.8
2  1.0        3   2.9   6.6   3.9   4.8   1.9  10.8   0.0
0  NaN  CYP2C19   KHV   CEU   CHB   CHS   JPT   SAS   YRI
1  0.0        1  64.6  63.1  59.2  59.0  60.1  47.9  49.1
2  1.0        2  25.7  13.1  33.5  35.2  32.2  35.7  16.7
3  2.0        3   1.5   0.0   4.4   4.8   7.2   1.2   0.0
4  3.0       17   4.1  22.2   2.4   1.0   0.5  13.6  24.5
0  NaN   CYP2D6   KHV   CEU   CHB   CHS   JPT   SAS   YRI
1  0.0        1  24.8  39.9  21.4  22.9  50.0  40.5  26.4
2  1.0        2  10.2  13.6  12.1   8.6  13.0  21.8  13.9
3  2.0       10  51.5   1.5  56.8  59.5  36.1   5.0   4.6
4  3.0       1

In [9]:
# Per-gene GST tables. They are read with header=None, so each file's own
# header line is kept as the first data row of its frame.
gst_paths = [
    'gst/gst_cyp2b6.csv',
    'gst/gst_cyp2c9.csv',
    'gst/gst_cyp2c19.csv',
    'gst/gst_cyp2d6.csv',
    'gst/gst_cyp3a5.csv',
    'gst/gst_cyp4f2.csv',
]
gst_frames = [pd.read_csv(path, header=None) for path in gst_paths]
gst_cyp2b6, gst_cyp2c9, gst_cyp2c19, gst_cyp2d6, gst_cyp3a5, gst_cyp4f2 = gst_frames

# Stack the tables in the order listed above, export the result, then show it.
gst = pd.concat(gst_frames)
gst.to_excel('gst.xlsx')
print(gst)

         0       1       2       3       4       5       6
0      NaN     CEU     CHB     CHS     JPT     SAS     YRI
1   CYP2B6  0.0061  0.0197  0.0193  0.0053  0.0228  0.0398
0      NaN     CEU     CHB     CHS     JPT     SAS     YRI
1   CYP2C9  0.0317  0.0003  0.0009  0.0036  0.0238  0.0275
0      NaN     CEU     CHB     CHS     JPT     SAS     YRI
1  CYP2C19  0.0228  0.0048  0.0068  0.0051  0.0202  0.0302
0      NaN     CEU     CHB     CHS     JPT     SAS     YRI
1   CYP2D6  0.0874  0.0019  0.0032  0.0341  0.0837   0.082
0      NaN     CEU     CHB     CHS     JPT     SAS     YRI
1   CYP3A5  0.1167   0.001  0.0019  0.0074  0.0005  0.1088
0      NaN     CEU     CHB     CHS     JPT     SAS     YRI
1   CYP4F2  0.0059  0.0006  0.0008  0.0004  0.0296  0.0315
