# pandas

In [None]:
# https://www.learndatasci.com/tutorials/python-pandas-tutorial-complete-introduction-for-beginners/

# https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html#min

In [None]:
import pandas as pd

In [None]:
# Map each ENCODE file accession used below to the tissue name shown on the plot axes.
tissue = dict(
    ENCFF016CBS='omental fat pad',
    ENCFF505TUS='prostate gland',
)

In [None]:
# Download the ENCODE file ENCFF016CBS as tab-separated text into a DataFrame
# and preview its first rows.
URL1 = 'https://www.encodeproject.org/files/ENCFF016CBS/@@download/ENCFF016CBS.tsv'
df1 = pd.read_csv(URL1, sep='\t')
df1.head()

In [None]:
# Download the ENCODE file ENCFF505TUS as tab-separated text into a DataFrame
# and preview its first rows.
URL2 = 'https://www.encodeproject.org/files/ENCFF505TUS/@@download/ENCFF505TUS.tsv'
df2 = pd.read_csv(URL2, sep='\t')
df2.head()

In [None]:
# Keep the column index and the underlying value array of df1 for inspection.
# NOTE(review): neither name is used again in the visible cells.
columns1 = df1.columns
values1 = df1.values

In [None]:
# Summary statistics for the first sample's table.
df1.describe()

In [None]:
# Summary statistics for the second sample's table.
df2.describe()

In [None]:
# Summary statistics for the TPM column of the second sample only.
df2['TPM'].describe()

In [None]:
# Print the structural summary of df1 (columns, dtypes, non-null counts).
df1.info()

In [None]:
# Show the column names of df1 as a plain list.
list(df1.columns)

In [None]:
# Restrict the first sample to the columns examined in the cells that follow.
kept_columns = [
    'length',
    'effective_length',
    'expected_count',
    'gene_id',
    'transcript_id(s)',
    'TPM',
]
df1_1 = df1[kept_columns]

In [None]:
# Preview the first rows of the column-restricted table.
df1_1.head()

In [None]:
# Preview the last rows of the column-restricted table.
df1_1.tail()

In [None]:
# Boolean mask marking the rows whose gene_id starts with 'ENSG' -- the same
# prefix the row filters in this notebook use and the one carried by the gene
# IDs looked up further down.
df1_1['gene_id'].str.startswith('ENSG')

In [None]:
# Keep only rows with TPM above 0.5 whose gene_id carries the 'ENSG' prefix,
# then report the shape of the result and preview it.
tpm_above_threshold = df1_1['TPM'] > 0.5
has_ensg_prefix = df1_1['gene_id'].str.startswith('ENSG')
df1_2 = df1_1[tpm_above_threshold & has_ensg_prefix]

df1_2.info()
df1_2.head()

In [None]:
# Remove the length/count columns; every other column is left as it is.
columns_to_drop = ['length', 'effective_length', 'expected_count']
df1_2 = df1_2.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Preview the filtered table after the column drop.
df1_2.head()

In [None]:
# Look up the row(s) for one specific gene ID.
target_gene_id = 'ENSG00000000419.12'
df1_2.loc[df1_2['gene_id'].eq(target_gene_id)]

In [None]:
# Select the rows whose gene_id is one of a hand-picked set of gene IDs.
interesting_genes = [
    'ENSG00000000457.13',
    'ENSG00000000460.16',
    'ENSG00000000005.5',
]
is_interesting = df1_2['gene_id'].isin(interesting_genes)
df1_2[is_interesting]

In [None]:
# Report the largest and smallest TPM remaining in the first filtered table.
tpm_first = df1_2['TPM']
print('max TPM:', tpm_first.max(), '\nmin TPM:', tpm_first.min())

In [None]:
# Show the column names that remain in df1_2 as a plain list.
list(df1_2.columns)

In [None]:
# Restrict the second sample to the identifier and TPM columns.
kept_columns_2 = ['gene_id', 'transcript_id(s)', 'TPM']
df2_1 = df2[kept_columns_2]

In [None]:
# Apply the same row filter to the second sample: TPM above 0.5 and a gene_id
# starting with 'ENSG'; then preview the result.
tpm_above_threshold_2 = df2_1['TPM'] > 0.5
has_ensg_prefix_2 = df2_1['gene_id'].str.startswith('ENSG')
df2_2 = df2_1[tpm_above_threshold_2 & has_ensg_prefix_2]
df2_2.head()

In [None]:
# Report the largest and smallest TPM remaining in the second filtered table.
tpm_second = df2_2['TPM']
print('max TPM:', tpm_second.max(), '\nmin TPM:', tpm_second.min())

In [None]:
# Join the two filtered tables on gene_id. Columns present in both inputs
# (e.g. TPM) come back with pandas' default _x / _y suffixes.
merge_df = df1_2.merge(df2_2, how='inner', on='gene_id')   # intersection -- pandas' default `how`
merge2_df = df1_2.merge(df2_2, how='outer', on='gene_id')  # union

In [None]:
# Preview the inner-joined table.
merge_df.head()

In [None]:
# Preview the outer-joined table.
merge2_df.head()

In [None]:
# Summary statistics for the inner-joined table.
merge_df.describe()

In [None]:
# Print the structural summary of the inner-joined table.
merge_df.info()

In [None]:
# merge_df[merge_df.gene_id.isin(interesting_genes)]

In [None]:
# TPM threshold used both to pick out high-TPM rows and to bound the plots.
cutoff = 500

# Plain alias of merge_df (same object, not a copy).
merge_df2 = merge_df

# Rows whose TPM_x value exceeds the cutoff.
tpm_x_above_cutoff = merge_df['TPM_x'] > cutoff
find_df = merge_df[tpm_x_above_cutoff]

In [None]:
# Display the rows whose TPM_x exceeded the cutoff.
find_df

In [None]:
# Keep only the rows where both TPM columns are below the cutoff, then draw a
# quick linear-scale scatter of one TPM column against the other.
both_below_cutoff = (merge_df['TPM_x'] < cutoff) & (merge_df['TPM_y'] < cutoff)
merge_df2 = merge_df[both_below_cutoff]

# The trailing semicolon keeps the notebook from echoing the returned Axes object.
merge_df2.plot.scatter(x='TPM_x', y='TPM_y', title='ENCFF016CBS vs ENCFF505TUS');

In [None]:
# Copy the two TPM columns of merge_df2 into plain Python lists so they can be
# walked element by element.
x_values = list(merge_df2['TPM_x'])
y_values = list(merge_df2['TPM_y'])

In [None]:
import math

# Points to draw, with one colour per point.
x_plot = []
y_plot = []
colors = []

# Walk the two TPM lists in lockstep (they come from the same frame, so they
# have the same length).
for x_tpm, y_tpm in zip(x_values, y_values):
    # Fold change regardless of direction: the larger of x/y and y/x.
    fold_change = max(x_tpm / y_tpm, y_tpm / x_tpm)

    # log10(fold_change) is positive only when the two values differ, so
    # points with identical TPM in both samples are left out of the plot.
    if math.log10(fold_change) > 0:
        x_plot.append(x_tpm)
        y_plot.append(y_tpm)
        # More than a two-fold difference in either direction is red,
        # anything smaller is purple.
        colors.append('red' if fold_change > 2.0 else 'purple')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Log-log scatter of the classified points, coloured by fold change.
s = 2  # marker size passed to scatter

# File accessions: used for the title and as keys into `tissue`.
x_label = 'ENCFF016CBS'
y_label = 'ENCFF505TUS'

fig, ax = plt.subplots()
ax.scatter(x_plot, y_plot, c=colors, s=s)

# Set the log scales before the limits, keeping the original call order.
ax.set_xscale('log')
ax.set_yscale('log')

ax.set_title(x_label + ' vs ' + y_label)
ax.grid(True)

# Both axis labels use the same "<tissue> (TPM)" format.
ax.set_xlabel(tissue[x_label] + ' (TPM)')
ax.set_ylabel(tissue[y_label] + ' (TPM)')

# [xmin, xmax, ymin, ymax]: show TPM from 1 up to the cutoff on both axes.
ax.axis([1, cutoff, 1, cutoff])
plt.show()