<a href="https://colab.research.google.com/github/lacey-conrad/WGCNA-work/blob/main/Gene_exp_baseNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split


from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU

from sklearn.decomposition import PCA  # to apply PCA

# 1. Reading in the dataset:



In [2]:
df_gastroc = pd.read_csv('gastroc_raw_counts.csv')

In [3]:
#df_copy = df.copy()

# 2. Data preprocessing:

In [4]:
df_gastroc['Row_Mean'] = df_gastroc.mean(axis=1)

  """Entry point for launching an IPython kernel.


In [5]:
df_gastroc = df_gastroc[df_gastroc.Row_Mean >= 10]

In [6]:
# Keep!!!
df_gastroc.drop("Row_Mean", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
df_gastroc = df_gastroc.T

In [8]:
df_gastroc.columns = df_gastroc.iloc[0]

In [9]:
df_gastroc = df_gastroc.iloc[1: , :]

In [10]:
df_gastroc = df_gastroc.reset_index()

In [11]:
df_gastroc = df_gastroc.rename(columns={'index': 'Treatment'})

In [12]:
df_gastroc['Treatment'] = df_gastroc['Treatment'].str.split('.').str[0]

In [13]:
cleanup_nums = {"Treatment": {"LN": 0, "ME": 1}}

In [14]:
# Organization of df complete with this .head()
df_gastroc = df_gastroc.replace(cleanup_nums)
df_gastroc.head()

Unnamed: 0,Treatment,LOC102903073,LOC102904318,Ppp2r1a,LOC102906476,LOC102922856,LOC102912116,Ppp4r2,Myadm,LOC107402623,...,EI127_mgp04,EI127_mgt17,EI127_mgt18,EI127_mgt19,EI127_mgp03,EI127_mgp02,EI127_mgt20,EI127_mgp01,EI127_mgt21,EI127_mgt22
0,0,10,27,336,384,6,13,33,83,23,...,11886,276,122,92,6169,5420,2144,16304,173,1511
1,0,12,35,264,340,13,7,51,36,39,...,12638,371,207,81,7024,6495,1673,16561,142,1280
2,0,5,21,175,407,21,10,43,43,30,...,9748,272,135,67,5841,5400,1414,13018,103,955
3,0,6,25,298,446,21,16,54,61,23,...,16001,495,230,116,9873,7522,2011,21779,196,1599
4,0,7,25,425,526,17,19,58,132,51,...,13241,337,180,91,7773,6657,1650,16713,109,1130


In [15]:
####################################
# Keep all code above!!! ###########
####################################

# 3. 
# y = target variable (high versus low altitude population)
#    X = predictor variables (genes -> PCA)

In [16]:
X = df_gastroc.drop(labels=['Treatment'], axis = 1)
y = df_gastroc.Treatment

In [17]:
y = y.astype(int)

In [18]:
#genes = list(X.columns)

In [19]:
#genes

# 4. Normalization

In [20]:
scaler = StandardScaler()
scaler.fit(X)
X_trans = scaler.transform(X)

In [21]:
X_trans

array([[-0.12377096,  0.57329925,  0.55214901, ...,  0.24854285,
         0.75002858,  2.0639271 ],
       [ 0.23980624,  1.66198874, -0.35443626, ...,  0.3339525 ,
         0.3855217 ,  1.21284819],
       [-1.03271396, -0.24321786, -1.47507639, ..., -0.84350423,
        -0.07305148,  0.01544281],
       ...,
       [-0.85092536, -0.51539024, -0.32925334, ..., -1.25360347,
        -0.66096581, -1.49881447],
       [-0.12377096, -1.33190735, -1.10992399, ..., -0.73682526,
        -0.4140418 , -0.75089664],
       [ 1.51232643, -0.10713168,  0.17440515, ..., -0.47959932,
        -0.73151553, -0.79510853]])

# 5. Apply PCA for dimensionality reduction

In [27]:
#Applying PCA
#Taking no. of Principal Components as 3
pca = PCA(n_components = 3)
pca.fit(X_trans)

PCA(n_components=3)

In [28]:
data_pca = pca.transform(X_trans)
data_pca = pd.DataFrame(data_pca,columns=['PC1','PC2','PC3'])
data_pca.head()

Unnamed: 0,PC1,PC2,PC3
0,-44.117166,26.196736,6.949403
1,-47.573065,-12.572122,-11.710366
2,-72.231403,-6.664256,-6.881616
3,17.838188,-20.30777,3.901216
4,73.31104,2.032396,-22.95701


In [29]:
#sns.heatmap(data_pca.corr(), cmap="YlGnBu")

In [30]:
X = data_pca

In [31]:
###############################################

# 6. Split data into a training and testing set 

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

In [33]:
X_train  = X_train.astype(int)
X_test  = X_test.astype(int)

In [34]:
X.shape

(47, 3)

In [35]:
#X

# 7. Create the NN model

In [36]:
model = tf.keras.models.Sequential()

Hyperparameters:

number of neurons: 128

activation function: ReLU

input_shape: (None,47,5 )

In [37]:
model.add(tf.keras.layers.Dense(units = 128, activation = 'relu', input_shape = (None, 47,3)))


Add a dropout layer

In [38]:
model.add(tf.keras.layers.Dropout(0.2))

Add an output layer

In [39]:
model.add(tf.keras.layers.Dense(units = 10, activation = 'softmax'))

Model compiler

In [40]:
model.compile(optimizer='adam',loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])


In [41]:
#model.fit(X_train, y_train, epochs =5)

Training the ANN

In [42]:
# Training the ANN:
model.fit(X_train, y_train, epochs = 10, batch_size = 10, verbose=1)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f082f254550>

# 8. Testing the accuracy of the NN

In [43]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)

  return dispatch_target(*args, **kwargs)




In [44]:
test_loss

0.11995889246463776

In [45]:
test_accuracy

0.8999999761581421

###########################

## ** Task 2: Predicting Treatment**