# In class exercises - Intro to Pandas Series and DataFrames

## Import libs

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
# Current working directory of the kernel process; used below to locate the data file.
cwd = os.getcwd()

## First import 'response_time_data.csv' data file
* Contains RTs from 800 trials of a simple detection task from each of 20 subjects
* Organized into a DataFrame and then saved out in csv format
* The index (row) and column labels are encoded in the csv file, so you'll need to read those in explicitly
* Make sure to have a look at the DataFrame - use the df.head() function

In [4]:
# Build the path with os.path.join so the separator is correct on every platform.
file_name = os.path.join(cwd, 'response_time_data.csv')
# The first column holds the trial labels (row index) and the first row holds
# the subject labels (column names), so both are read in explicitly.
df = pd.read_csv(file_name, index_col=0, header=0)
df.head()

Unnamed: 0,Sub0,Sub1,Sub2,Sub3,Sub4,Sub5,Sub6,Sub7,Sub8,Sub9,Sub10,Sub11,Sub12,Sub13,Sub14,Sub15,Sub16,Sub17,Sub18,Sub19
Tri0,2797.22424,1039.571212,4045.345952,3530.93421,2410.276348,6541.494156,1977.919842,2343.555594,143.695964,8147.939691,5183.942423,4548.240971,2076.921296,4230.548795,4134.589984,2067.132295,4087.049471,2704.327437,2790.476384,5141.106292
Tri1,786.895089,3076.223066,1033.310418,3758.043454,4000.805778,2756.802996,2918.768116,2613.934992,2655.684434,7410.337807,3182.903975,4324.103096,1843.506277,1338.453235,2693.772203,7239.094853,1320.715043,4449.372349,1085.884483,3556.231671
Tri2,3516.902396,4632.818016,4874.066155,3031.377402,2485.677228,4929.841314,435.950399,3059.241733,2923.3256,3530.389021,3002.555229,7537.781867,1989.249165,4513.510928,4473.73304,7422.364759,3338.164717,4840.676786,2721.343095,1972.689272
Tri3,333.88183,104.448476,2304.093856,586.098266,4575.178155,2365.682721,1285.101296,5050.566343,2446.870606,5096.855057,1047.603006,5431.187785,2879.554454,311.31906,2814.385809,3396.500194,1324.780081,1518.991979,1676.395223,2051.924695
Tri4,6790.330061,2629.751046,3148.222058,1894.867975,2274.057485,8186.457041,1195.253881,3747.385847,1456.694541,3437.159878,6745.578676,4101.871682,1944.773775,1571.942134,3186.806328,6588.562378,2866.277989,2079.88084,1086.063139,7051.740732


## Now have a look at the data using built-in Pandas functionality
* Check out the max/min of each row, standard deviation, percentiles, etc.

In [34]:
# Per-subject (column-wise) summary table; kept in `d` because a later cell prints it.
d = df.describe()

# axis=1 reduces across subjects, giving one value per trial (row).
print('Max:')
print(df.max(axis=1))

print('\n Min:')
print(df.min(axis=1))

print('\n Std:')
print(df.std(axis=1))

# describe() with 11 evenly spaced percentiles reports every decile (0%..100%)
# for each subject, so the header says deciles rather than "25%".
print('\n Percentiles (deciles):')
print(df.describe(percentiles=np.linspace(0, 1, 11)))


Max:
Tri0       8147.939691
Tri1       7410.337807
Tri2       7537.781867
Tri3       5431.187785
Tri4       8186.457041
Tri5       5268.016210
Tri6       6009.945843
Tri7       7505.781800
Tri8       5277.547086
Tri9       5802.017010
Tri10      9493.147896
Tri11      6855.044510
Tri12      7079.723259
Tri13      7583.853123
Tri14      8460.181684
Tri15      6603.612971
Tri16      6352.487413
Tri17      7033.016570
Tri18      6076.972257
Tri19      6875.100886
Tri20      7175.587179
Tri21      9126.489066
Tri22      6952.390799
Tri23      6783.466281
Tri24     10061.049263
Tri25      6404.474740
Tri26      6221.959882
Tri27      7500.719769
Tri28      6439.059402
Tri29      6449.023514
              ...     
Tri770     8775.582668
Tri771     9753.921246
Tri772     8765.415950
Tri773     8407.932097
Tri774     7119.668571
Tri775     9133.851062
Tri776     4466.085911
Tri777     8827.325748
Tri778     6830.768774
Tri779     9756.056048
Tri780     9967.211069
Tri781     6253.286042
Tri782

## Are there missing values (NaNs) in the data?

In [40]:
# Count missing values per subject (column).
print(df.isna().sum())
# .any must be *called*: a bare method reference is always truthy, so the
# message would print even for NaN-free data. DataFrame.any() reduces per
# column, so a second .any() collapses the result to a single bool.
if df.isna().any().any():
    print('\nThere are NaNs in the data')

Sub0      0
Sub1      0
Sub2      0
Sub3      0
Sub4      4
Sub5      0
Sub6      0
Sub7      1
Sub8      0
Sub9      2
Sub10     0
Sub11    11
Sub12     0
Sub13     3
Sub14     3
Sub15     0
Sub16     0
Sub17    15
Sub18     7
Sub19     0
dtype: int64

There are NaNs in the data


## What about outliers? Let's define outliers here as values more than 2 * std away from the mean for each subject
* After you've found the outliers for each subject, replace those values with np.nan (NaN)

In [41]:
# `d` is the per-subject df.describe() table from the earlier cell; its mean and
# std rows are the inputs for the 2 * std outlier rule described above.
print(d)

               Sub0         Sub1         Sub2         Sub3         Sub4  \
count    800.000000   800.000000   800.000000   800.000000   796.000000   
mean    3492.614323  2549.787915  2498.108943  3502.338174  2489.637962   
std     1779.474153  1476.122674  1434.749989  1722.695784  1394.508376   
min       14.891233    14.197194    28.100837    23.429444     4.830230   
25%     2248.698649  1453.328596  1444.344991  2175.921827  1414.327797   
50%     3356.267518  2332.504240  2378.456796  3425.256242  2331.301722   
75%     4590.539997  3572.853241  3365.725589  4656.169105  3418.744846   
max    10681.396388  9612.953879  7980.653894  9126.489066  7711.685320   

               Sub5         Sub6          Sub7          Sub8          Sub9  \
count    800.000000   800.000000    799.000000    800.000000    798.000000   
mean    4583.557298  2587.373753   3528.493482   1587.012676   4367.761563   
std     2544.771595  1529.182544   2000.548574   1302.153904   1935.519959   
min      136

## After you've found the outliers and replaced with NaNs for each subject, check out this function:
[pandas.DataFrame.interpolate](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.interpolate.html#pandas.DataFrame.interpolate)

* Use this function to interpolate the missing values for each subject (do not interpolate across subjects!)
* Just use linear interpolation...

## You can explore the "Missing Values" page for Pandas to figure out other ways of filling in missing values and outliers

[page is here](https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data)

* Figure out how to replace the outliers with the mean of each subject

## Use the pandas.DataFrame.sample function to generate bootstrapped confidence intervals for the data from subject 11

[see this page for Samples](https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.DataFrame.sample.html)


* Resample Sub11's data with replacement, each time pulling N samples (800 in this case)
* Generate a distribution of means across all resamples
* Compute 95% confidence intervals using:

[this page for quantiles](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.quantile.html)