# Processing and Imputation
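This notebook runs the same preprocessing steps and MAGIC imputation on each of the 7 sample files from Farhadian, 2018. For every sample, the filtered and normalized data and the MAGIC-imputed data are written as CSV files under `../../data/imputation_intermediate/`.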

### Part 0: Importing Packages

In [1]:
# import statements
import csv
import gzip
import os

import magic
import scprep

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE
from scprep.io.csv import load_csv

### Part 1: Locating Data

In [2]:
# Directory holding the raw per-sample CSV files (relative to this notebook).
path = '../../data/raw_data/'

# One CSV per sample; every file goes through the same pipeline below.
file_names = [
    'HIV1_Bld.csv',
    'HIV1_CSF.csv',
    'HIV2_Bld.csv',
    'HIV2_CSF.csv',
    'HIV3_CSF.csv',
    'Uninfected1_CSF.csv',
    'Uninfected2_CSF.csv',
]

### Part 2: Filtering and Performing MAGIC

In [5]:
# For every sample: load, filter, normalize, save the filtered matrix,
# then run MAGIC imputation and save the imputed matrix.

# Output locations are the same for every sample, so define them once.
filtered_path = '../../data/imputation_intermediate/filtered_data/'
magic_path = '../../data/imputation_intermediate/magic_all_samples/'

# Fixed seed passed to MAGIC so that re-running the notebook reproduces the
# same imputed values.
magic_seed = 42

# Make sure the output directories exist so to_csv does not fail on a
# fresh checkout.
for out_dir in (filtered_path, magic_path):
    os.makedirs(out_dir, exist_ok=True)

# Iterate over the sample list itself so that adding or removing a file in
# `file_names` does not require touching this loop.
for file_name in file_names:

    print(file_name)
    dat = scprep.io.load_csv(path + file_name)
    # Flip the matrix orientation; the MAGIC log below reports rows as cells
    # and columns as genes after this step.
    dat = dat.transpose()

    # Remove empty columns and rows
    dat = scprep.filter.filter_empty_cells(dat)
    dat = scprep.filter.filter_empty_genes(dat)

    # Filter out genes expressed in fewer than 3 cells
    dat = scprep.filter.filter_rare_genes(dat, cutoff=0, min_cells=3)

    # Keep only cells with a library size above 500 ...
    dat = scprep.filter.filter_library_size(dat, cutoff=500, keep_cells='above')

    # ... and below 2500
    dat = scprep.filter.filter_library_size(dat, cutoff=2500, keep_cells='below')

    # Filter on expression of the "MT"-prefixed gene set. `percentile=90` is a
    # percentile across cells, not a ">90% mitochondrial reads" threshold.
    # NOTE(review): this relies on scprep's default `keep_cells` for this
    # filter (presumably 'below', i.e. the top ~10% of cells are dropped) --
    # confirm for the installed scprep version.
    # NOTE(review): starts_with="MT" also matches any other gene whose name
    # begins with "MT" (e.g. MTOR). If the gene names are HGNC symbols,
    # mitochondrial genes are prefixed "MT-" -- confirm against the data
    # before changing the prefix.
    mt_genes = scprep.select.get_gene_set(dat, starts_with="MT")
    dat = scprep.filter.filter_gene_set_expression(dat, genes=mt_genes,
                                                percentile=90)

    # Normalize by library size, then square-root transform
    dat = scprep.normalize.library_size_normalize(dat)
    dat = scprep.transform.sqrt(dat)

    # Save data after preprocessing and before MAGIC
    out_name = filtered_path + file_name
    dat.to_csv(out_name)

    # Run MAGIC with default parameters and a fixed seed
    magic_op = magic.MAGIC(random_state=magic_seed)
    dat_magic = magic_op.fit_transform(dat)

    # Save the imputed data
    out_name = magic_path + file_name
    dat_magic.to_csv(out_name)

HIV1_Bld.csv
Calculating MAGIC...
  Running MAGIC on 1669 cells and 22935 genes.
  Calculating graph and diffusion operator...




    Calculating PCA...
    Calculated PCA in 4.42 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.54 seconds.
    Calculating affinities...
    Calculated affinities in 0.53 seconds.
  Calculated graph and diffusion operator in 5.77 seconds.
  Calculating imputation...
    Automatically selected t = 13
  Calculated imputation in 0.79 seconds.
Calculated MAGIC in 7.63 seconds.
HIV1_CSF.csv
Calculating MAGIC...
  Running MAGIC on 756 cells and 18507 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 1.07 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.09 seconds.
    Calculating affinities...
    Calculated affinities in 0.10 seconds.
  Calculated graph and diffusion operator in 1.30 seconds.
  Calculating imputation...
    Automatically selected t = 10
  Calculated imputation in 0.17 seconds.
Calculated MAGIC in 1.75 seconds.
HIV2_Bld.csv
Calculating MAGIC...
  Running MAGIC on 2825 cells and 22306 genes.
  Calculating graph and diffusion operator...




    Calculating PCA...
    Calculated PCA in 6.63 seconds.
    Calculating KNN search...
    Calculated KNN search in 1.18 seconds.
    Calculating affinities...
    Calculated affinities in 2.07 seconds.
  Calculated graph and diffusion operator in 10.40 seconds.
  Calculating imputation...
    Automatically selected t = 11
  Calculated imputation in 1.28 seconds.
Calculated MAGIC in 13.33 seconds.
HIV2_CSF.csv
Calculating MAGIC...
  Running MAGIC on 1964 cells and 21918 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 3.85 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.55 seconds.
    Calculating affinities...
    Calculated affinities in 1.00 seconds.
  Calculated graph and diffusion operator in 5.66 seconds.
  Calculating imputation...
    Automatically selected t = 17
  Calculated imputation in 1.21 seconds.
Calculated MAGIC in 7.75 seconds.
HIV3_CSF.csv
Calculating MAGIC...
  Running MAGIC on 1476 cells and 21034 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 2.93 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.36 seconds.
    Calculating affinities...
    Calculated affinities in 0.57 seconds.
  Calculated graph and diffusion operator in 4.01 seconds.
  Calculating imputation...
    Automatically selected t = 12
  Calculated imputation in 0.55 seconds.
Calculated MAGIC in 5.19 seconds.
Uninfected1_CSF.csv
Calculating MAGIC...
  Running MAGIC on 268 cells and 18323 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 0.56 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.02 seconds.
    Calculating affinities...
  Calculated graph and diffusion operator in 0.60 seconds.
  Calculating imputation...
    Automatically selected t = 7
  Calculated imputation in 0.05 seconds.
Calculated MAGIC in 0.76 seconds.
Uninfected2_CSF.csv
Calculating MAGIC...
  Running MAGIC on 461 cells and 18054 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 0.77 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.04 seconds.
    Calculating affinities...
    Calculated affinities in 0.03 seconds.
  Calculated graph and diffusion operator in 0.87 seconds.
  Calculating imputation...
    Automatically selected t = 9
  Calculated imputation in 0.08 seconds.
Calculated MAGIC in 1.13 seconds.
