# Processing and Imputation
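This notebook applies the same preprocessing steps (removal of empty cells and genes, rare-gene and library-size filtering, library-size normalization, and a square-root transform) followed by MAGIC imputation to each of the 7 sample files from Farhadian et al., 2018. The filtered data and the imputed data are each saved as one CSV per sample.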

### Part 0: Importing Packages

In [16]:
# import statements
import csv
import gzip
import os

import magic
import scprep

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE
from scprep.io.csv import load_csv

### Part 1: Locating Data

In [17]:
# Folder containing the raw input CSVs, relative to this notebook.
path = '../../data/raw_data/'

# The 7 sample files to process, listed in processing order.
file_names = [
    'HIV1_Bld.csv',
    'HIV1_CSF.csv',
    'HIV2_Bld.csv',
    'HIV2_CSF.csv',
    'HIV3_CSF.csv',
    'Uninfected1_CSF.csv',
    'Uninfected2_CSF.csv',
]

### Part 2: Filtering and Performing MAGIC

In [18]:
# Run identical preprocessing on every sample, save the filtered matrix,
# then impute with MAGIC and save the imputed matrix.
# Uses `path` and `file_names` defined in the "Locating Data" cell.

# Output folders: one for preprocessed data, one for MAGIC-imputed data.
filtered_dir = '../../data/filtered_data/'
magic_dir = '../../data/imputation_intermediate/magic_all_samples/'

# Create the output folders up front so to_csv does not fail on a fresh checkout.
for out_dir in (filtered_dir, magic_dir):
    os.makedirs(out_dir, exist_ok=True)

# Iterate over the file list itself so adding/removing a sample needs no other change.
for file_name in file_names:
    print(file_name)

    # Load the raw matrix and transpose it before filtering.
    # NOTE(review): presumably the raw CSVs are genes x cells and scprep expects
    # cells in rows -- confirm against the raw files.
    dat = scprep.io.load_csv(path + file_name)
    dat = dat.transpose()

    # Remove empty cells and empty genes
    dat = scprep.filter.filter_empty_cells(dat)
    dat = scprep.filter.filter_empty_genes(dat)

    # Keep only genes with non-zero counts (cutoff=0) in at least 3 cells
    dat = scprep.filter.filter_rare_genes(dat, cutoff=0, min_cells=3)

    # Keep only cells whose library size is above 500
    dat = scprep.filter.filter_library_size(dat, cutoff=500, keep_cells='above')

    # Library-size normalize, then square-root transform
    dat = scprep.normalize.library_size_normalize(dat)
    dat = scprep.transform.sqrt(dat)

    # Save data after preprocessing and before MAGIC
    out_name = filtered_dir + file_name
    dat.to_csv(out_name)

    # Run MAGIC with its default parameters; a fresh operator is fit per sample.
    # NOTE(review): no random_state is passed, so results may differ slightly
    # between runs -- consider fixing a seed if exact reproducibility matters.
    magic_op = magic.MAGIC()
    dat_magic = magic_op.fit_transform(dat)

    # Save the imputed matrix under the same file name
    out_name = magic_dir + file_name
    dat_magic.to_csv(out_name)

HIV1_Bld.csv
Calculating MAGIC...
  Running MAGIC on 2830 cells and 22935 genes.
  Calculating graph and diffusion operator...




    Calculating PCA...
    Calculated PCA in 6.26 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.99 seconds.
    Calculating affinities...
    Calculated affinities in 0.93 seconds.
  Calculated graph and diffusion operator in 8.58 seconds.
  Calculating imputation...
    Automatically selected t = 11
  Calculated imputation in 0.69 seconds.
Calculated MAGIC in 10.54 seconds.
HIV1_CSF.csv
Calculating MAGIC...
  Running MAGIC on 1262 cells and 18507 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 2.12 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.21 seconds.
    Calculating affinities...
    Calculated affinities in 0.20 seconds.
  Calculated graph and diffusion operator in 2.65 seconds.
  Calculating imputation...
    Automatically selected t = 11
  Calculated imputation in 0.30 seconds.
Calculated MAGIC in 3.38 seconds.
HIV2_Bld.csv
Calculating MAGIC...
  Running MAGIC on 4024 cells and 22306 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 6.79 seconds.
    Calculating KNN search...
    Calculated KNN search in 1.93 seconds.
    Calculating affinities...
    Calculated affinities in 2.94 seconds.
  Calculated graph and diffusion operator in 12.23 seconds.
  Calculating imputation...
    Automatically selected t = 12
  Calculated imputation in 1.63 seconds.
Calculated MAGIC in 15.67 seconds.
HIV2_CSF.csv
Calculating MAGIC...
  Running MAGIC on 3048 cells and 21918 genes.
  Calculating graph and diffusion operator...



    Calculating PCA...
    Calculated PCA in 5.10 seconds.
    Calculating KNN search...
    Calculated KNN search in 1.11 seconds.
    Calculating affinities...
    Calculated affinities in 2.00 seconds.
  Calculated graph and diffusion operator in 8.67 seconds.
  Calculating imputation...
    Automatically selected t = 15
  Calculated imputation in 1.52 seconds.
Calculated MAGIC in 11.46 seconds.
HIV3_CSF.csv
Calculating MAGIC...
  Running MAGIC on 2401 cells and 21034 genes.
  Calculating graph and diffusion operator...




    Calculating PCA...
    Calculated PCA in 3.84 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.68 seconds.
    Calculating affinities...
    Calculated affinities in 1.25 seconds.
  Calculated graph and diffusion operator in 6.15 seconds.
  Calculating imputation...
    Automatically selected t = 13
  Calculated imputation in 1.19 seconds.
Calculated MAGIC in 8.24 seconds.
Uninfected1_CSF.csv
Calculating MAGIC...
  Running MAGIC on 400 cells and 18323 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 0.81 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.03 seconds.
    Calculating affinities...
    Calculated affinities in 0.01 seconds.
  Calculated graph and diffusion operator in 0.89 seconds.
  Calculating imputation...
    Automatically selected t = 11
  Calculated imputation in 0.09 seconds.
Calculated MAGIC in 1.16 seconds.
Uninfected2_CSF.csv
Calculating MAGIC...
  Running MAGIC on 672 cells and 18054 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 0.98 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.08 seconds.
    Calculating affinities...
    Calculated affinities in 0.08 seconds.
  Calculated graph and diffusion operator in 1.18 seconds.
  Calculating imputation...
    Automatically selected t = 12
  Calculated imputation in 0.18 seconds.
Calculated MAGIC in 1.61 seconds.
