# In class exercises - Intro to Pandas Series and DataFrames

## Import libs

In [4]:
import pandas as pd
from pandas import DataFrame, read_csv
import numpy as np
import matplotlib.pyplot as plt

import os
cwd = os.getcwd()

## First import 'response_time_data.csv' data file
* Contains RTs from 800 trials of a simple detection task from each of 20 subjects
* Organizing into a DataFrame and then saved out in csv format
* The index (row) and column labels are encoded in the csv file, so you'll need to read those in explcitly
* Make sure to have a look at the DataFrame - use the df.head() function

In [9]:
file_name = cwd + '/response_time_data.csv'

#Creating the data frame
df = pd.read_csv(file_name, index_col=0, header=0)
df.head()

df.to_csv('response_time_data.csv',index=True,header=True)


## Now have a look at the data using built in Padas functionality
* Check out the max/min of each row, standard deviation, percentiles, etc.

In [22]:
df.describe()

#Percentiles
df.describe(percentiles=np.linspace(0,1,11))

#Max/Min
print("Max: \n ")
print("\n", df.max(axis=1))

print("Min: \n ")
print("\n", df.min(axis=1))

#Standard deviation
print("Standard Deviation: \n")
print(df.std(axis=1))

Max: 
 

 Tri0       8147.939691
Tri1       7410.337807
Tri2       7537.781867
Tri3       5431.187785
Tri4       8186.457041
Tri5       5268.016210
Tri6       6009.945843
Tri7       7505.781800
Tri8       5277.547086
Tri9       5802.017010
Tri10      9493.147896
Tri11      6855.044510
Tri12      7079.723259
Tri13      7583.853123
Tri14      8460.181684
Tri15      6603.612971
Tri16      6352.487413
Tri17      7033.016570
Tri18      6076.972257
Tri19      6875.100886
Tri20      7175.587179
Tri21      9126.489066
Tri22      6952.390799
Tri23      6783.466281
Tri24     10061.049263
Tri25      6404.474740
Tri26      6221.959882
Tri27      7500.719769
Tri28      6439.059402
Tri29      6449.023514
              ...     
Tri770     8775.582668
Tri771     9753.921246
Tri772     8765.415950
Tri773     8407.932097
Tri774     7119.668571
Tri775     9133.851062
Tri776     4466.085911
Tri777     8827.325748
Tri778     6830.768774
Tri779     9756.056048
Tri780     9967.211069
Tri781     6253.286042
T

# Are there missing values (NaNs) in the data?

In [32]:
isNaN = np.sum(np.isnan(df), axis=0)
print(isNaN)

#Checking if any values in data are true. If so there are NaNs
if (isNaN.any(axis=0)):
    print("There are missing values in your data")
    

Sub0      0
Sub1      0
Sub2      0
Sub3      0
Sub4      4
Sub5      0
Sub6      0
Sub7      1
Sub8      0
Sub9      2
Sub10     0
Sub11    11
Sub12     0
Sub13     3
Sub14     3
Sub15     0
Sub16     0
Sub17    15
Sub18     7
Sub19     0
dtype: int64
There are missing values in your data


## What about outliers? Lets define outliers here as > 2 * std away from the mean for each subject
* After you've found the outliers for each subject, replace those values with a np.nan (NaN)

In [45]:
#d = df.describe()

std = df.std()
mu = df.mean()
print(std)

outlier = (df > (mu+std)) or (df < (mu - std))
print(outlier)



Sub0     1779.474153
Sub1     1476.122674
Sub2     1434.749989
Sub3     1722.695784
Sub4     1394.508376
Sub5     2544.771595
Sub6     1529.182544
Sub7     2000.548574
Sub8     1302.153904
Sub9     1935.519959
Sub10    1745.629161
Sub11    2662.686275
Sub12    2898.418570
Sub13    1452.494803
Sub14    2151.655387
Sub15    1976.030065
Sub16    1497.644375
Sub17    2648.316102
Sub18    1456.803723
Sub19    2051.493761
dtype: float64


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## After you've found the outliers and replaced with NaNs for each subject, check out this function:
[pandas.DataFrame.interpolate](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.interpolate.html#pandas.DataFrame.interpolate)

* Use this function to interpolate the missing values for each subject (do not interpolate across subjects!)
* Just use linear interpolation...

## You can explore the "Missing Values" page for Pandas to figure out other ways of filling in missing values and outliers

[page is here](https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data)

* Figure out how to replace the outliers with the mean of each subject

## Use the Pandas.DataFrame.Sample function to generate bootstrapped confidence intervals for the data from subject 11

[see this page for Samples](https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.DataFrame.sample.html)


* Resample Sub11's data with replacement, each time pulling N samples (800 in this case)
* Generate a distribution of means across all resamples
* Compute 95% confidence intervals using:

[this page for quantiles](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.quantile.html)