##### Author: Jimin Kim (jk55@uw.edu)
##### Version 1.9.0

# LAB4: 
# WORKING WITH REAL WORLD STRUCTURED DATA

## Part 1: Data Formats

### 1.1 - LOADING CSV DATA WITH PANDAS: DIABETES DATA

In [None]:
import pandas as pd

# Read diabetes.csv (looked up relative to the working directory) into a pandas DataFrame
diabetes = pd.read_csv('diabetes.csv')

# Preview first few rows with head() (5 rows by default).
# As the last expression of the cell, its result is shown as the cell output.
diabetes.head()

In [None]:
print(diabetes.columns[0:5]) # Get the first five column names by slicing the .columns attribute

In [None]:
# .columns is an attribute (no parentheses); iterating over it yields each column name as a string
for column_name in diabetes.columns:
    print(column_name)

In [None]:
type(diabetes) # The loaded CSV file is held in a pandas DataFrame object

In [None]:
# The NumPy array holds only the values; column names stay with the DataFrame
diabetes_np = diabetes.to_numpy() # Convert the DataFrame to a NumPy array with .to_numpy()
print(diabetes_np.shape)          # Shape is (rows, columns); expected to be (768, 9) for this dataset

## Part 2: Data Structures in Python

### 2.1 - DATA STRUCTURES: NUMPY ARRAYS []

In [None]:
import numpy as np

In [None]:
# 1D

array_1d = diabetes_np[:, 1] # All rows of column 1 (Glucose); a single column index gives a 1D array
print(array_1d.shape)        # Shape has one entry: (number of rows,)

In [None]:
# 2D

array_2d = diabetes_np[:, 1:4] # All rows of columns 1-3 (Glucose through SkinThickness); stop index 4 is excluded
print(array_2d.shape)          # Shape has two entries: (number of rows, 3)

In [None]:
# 3D

# A slice excludes its stop index, so each sub-array below holds exactly 100 rows
diabetes_np_first100 = diabetes_np[:100]        # Rows 0 - 99 (all columns)
diabetes_np_100_to_200 = diabetes_np[100:200]   # Rows 100 - 199 (all columns)

# Each sub-array is still 2D: (rows, columns)
print(diabetes_np_first100.shape, diabetes_np_100_to_200.shape)

# np.stack() joins equally-shaped 2D arrays along a new leading axis -> 3D
array_3d = np.stack((diabetes_np_first100, diabetes_np_100_to_200), axis=0)
print(array_3d.shape)

### 2.2 - DATA STRUCTURES: TUPLES ()

In [None]:
# A tuple is written with parentheses.
# Here every item is an int, i.e. the data is homogeneous (all items share one data type).
tuple_1 = (1, 2, 3, 4, 5)

print(tuple_1)

In [None]:
# A tuple may also mix data types (heterogeneous data): here ints and strings
tuple_2 = (
    1, 2, 3,
    'banana', 'apple', 'orange',
)

print(tuple_2)

In [None]:
# tuple vs list: the same items, written with different brackets

tuple_1 = (1, 2, 3, 4, 5)   # parentheses      -> tuple
list_1 = [1, 2, 3, 4, 5]    # square brackets  -> list

In [None]:
# Lists are mutable: changing the first element of the list to 10 modifies it in place

list_1[0] = 10
print(list_1)

In [None]:
# Tuples are immutable: the same item assignment raises a TypeError.
# This cell is expected to fail; it demonstrates the difference from lists.

tuple_1[0] = 10

### 2.3 - DATA STRUCTURES: DICTIONARIES {}

In [None]:
# Dictionary example
# Data are stored under 'keys' - "Department", "Instructor", ...
# Each key can hold ANY data type - string, integer, float, list, np.array, etc.

ece241_dict = {
    'Department': 'UW ECE',                                           # string
    'Instructor': 'Jimin Kim',                                        # string
    'Number of students': 100,                                        # integer
    'Number of students per lab': np.array([20, 24, 24, 24, 8]),      # NumPy array
    'Topics covered': ['Python', 'Signal processing', 'Data Types'],  # list of strings
}

In [None]:
ece241_dict.keys()  # .keys() returns a view of all the keys currently in the dictionary

In [None]:
# A value is retrieved by indexing the dictionary with its key: dictionary[key]

for key in ('Department', 'Number of students per lab', 'Topics covered'):
    print(ece241_dict[key])

In [None]:
# Adding a key to dictionary: assigning to a key that does not exist yet creates it

# New key whose value is a python list of strings
ece241_dict['Meeting times'] = ['M', 'T', 'W', 'Th', 'F']
print(ece241_dict['Meeting times'])

# New key whose value is a numpy array
ece241_dict['Number of instructors per lab'] = np.array([1, 1, 2, 2, 2])
print(ece241_dict['Number of instructors per lab'])

In [None]:
# Deleting a key from dictionary (the value stored under that key is removed with it)

del ece241_dict['Topics covered']

print(ece241_dict.keys()) # The dictionary no longer includes 'Topics covered'

## Part 3: Visualizing Data

### 3.1 - TIMESERIES PLOTS

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load TSLA.csv and GOOGL.csv stock data, then convert each DataFrame to a NumPy array

tesla = pd.read_csv('TSLA.csv')
tesla_np = tesla.to_numpy()

google = pd.read_csv('GOOGL.csv')
google_np = google.to_numpy()

In [None]:
fig = plt.figure(figsize=(23,5))

# Repeated plt.plot() calls draw onto the same axes, overlaying one line on the other.
# Column 4 holds the closing price. Only y-values are passed, so the x-axis is the
# row index (day number within the dataset).
# The GOOGL series is cut to the length of the TSLA series so both span the same number of days.
plt.plot(tesla_np[:, 4], linewidth = 5, color = 'blue', label = 'TSLA')
plt.plot(google_np[:len(tesla_np), 4], linewidth = 5, color = 'red', label = 'GOOGL')
plt.xticks(fontsize=20) # adjusts the size of x-axis ticks
plt.yticks(fontsize=20) # adjusts the size of y-axis ticks
plt.xlabel('Days', fontsize = 25)           # x-axis: row index of the data
plt.ylabel('Closing Price', fontsize = 25)  # y-axis: the plotted closing-price values
plt.legend(fontsize = 20)                   # tells the two lines apart

### 3.2 - SCATTER PLOTS

In [None]:
fig = plt.figure(figsize=(25,7))

# plt.scatter takes two 1D arrays: the x-axis data first, the y-axis data second.
# Both panels put Glucose (column 1) on the x-axis; only the y-axis column differs.
scatter_panels = [(5, 'BMI'), (4, 'Insulin')]   # (column index in diabetes_np, y-axis label)

for panel_number, (y_column, y_label) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 2, panel_number)             # 1 row, 2 columns, panel counted from 1
    plt.scatter(diabetes_np[:, 1], diabetes_np[:, y_column], color = 'black')
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.xlabel('Glucose', fontsize = 25)
    plt.ylabel(y_label, fontsize = 25)

### 3.3 - BAR GRAPHS

In [None]:
# Build Boolean masks marking the rows with and without diabetes
# (the same boolean mask technique used in Lab 3 for image subsetting).

# diabetes_np[:, -1] extracts the last column of the diabetes data (i.e. outcomes)
# outcome = 1: diabetic, outcome = 0: non-diabetic
outcomes = diabetes_np[:, -1]

# An equality comparison against an array is evaluated element by element
# and returns a 1D bool array with one True/False entry per row
diabetes_pos_ind = outcomes == 1    # True where the row has outcome = 1
diabetes_neg_ind = outcomes == 0    # True where the row has outcome = 0

print(diabetes_pos_ind[:100])

In [None]:
# A bool array used as the row index keeps only the rows where it is True:
# diabetes_pos_ind is True for diabetic rows, diabetes_neg_ind for non-diabetic rows.
# Leaving out the column index keeps every column of the selected rows.
diabetes_np_pos = diabetes_np[diabetes_pos_ind]
diabetes_np_neg = diabetes_np[diabetes_neg_ind]

In [None]:
# x-axis labels, one per attribute column (the last column, the outcome, is not plotted)

x_labels = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DPF', 'Age']

fig = plt.figure(figsize=(20,10))

# One panel per group: (samples, bar color, panel title)
bar_panels = [
    (diabetes_np_pos, 'blue', 'Diabetic'),
    (diabetes_np_neg, 'red', 'Non-Diabetic'),
]

# Plot bar graphs of the averaged attributes for each group

for panel_number, (samples, bar_color, panel_title) in enumerate(bar_panels, start=1):
    plt.subplot(1, 2, panel_number)

    # mean(axis = 0) averages every column over the rows; [:-1] drops the last column's mean
    column_means = samples.mean(axis = 0)[:-1]

    plt.bar(x_labels, column_means, color = bar_color)
    plt.title(panel_title, fontsize = 20)
    plt.xticks(fontsize=20, rotation = 90) # rotation takes degree values to rotate the x-ticks
    plt.yticks(fontsize=20)
    plt.ylim(0, 150)                       # same y-range in both panels so bar heights are comparable

### 3.4 - COLORMAPS

In [None]:
# Subset the rows and columns to visualize:
#   rows    - 1500 to 1599 (the stop index 1600 is excluded)
#   columns - 1, 2, 3, 4
# then transpose (.T) so that rows = attributes, columns = days

tesla_2_visualize = tesla_np[1500:1600, [1, 2, 3, 4]].T

In [None]:
fig = plt.figure(figsize=(30,5))

# Cast the array to 'float' so the data is compatible with the colormap function
heatmap_values = tesla_2_visualize.astype('float')

plt.pcolor(heatmap_values, cmap = 'jet')
plt.xticks(fontsize=20)
plt.yticks(color='white') # White tick labels are effectively hidden
plt.xlabel('Days', fontsize = 20)
plt.ylabel('Open, High, Low, Close', fontsize = 20)
plt.colorbar()            # Adds the scale that maps colors to values

### 3.5 - HISTOGRAMS

In [None]:
fig = plt.figure(figsize=(40,5))

# One histogram of the Glucose column (column 1) per group: (samples, bar color)
hist_panels = [(diabetes_np_pos, 'blue'), (diabetes_np_neg, 'red')]

for panel_number, (samples, hist_color) in enumerate(hist_panels, start=1):
    plt.subplot(1, 2, panel_number)

    plt.hist(samples[:, 1], color = hist_color, bins = 50)   # values are grouped into 50 bins
    plt.xticks(fontsize=30)
    plt.yticks(fontsize=30)
    plt.xlabel('Glucose', fontsize = 40)
    plt.ylabel('n', fontsize = 40)                            # n = number of samples in each bin

## Part 4: Processing and Analyzing Data

### 4.1 - BASIC MATH OPERATIONS: 

In [None]:
# Test matrix: the integers 1..25 laid out row by row in a 5 x 5 array

array2d = np.arange(1, 26).reshape(5, 5)

# SUMMATION ALONG AXIS

print(array2d.sum(axis = 0))   # axis = 0: sum down the rows    -> one value per column
print(array2d.sum(axis = 1))   # axis = 1: sum across the columns -> one value per row

In [None]:
# AVERAGING ALONG AXIS

print(np.mean(array2d, axis=0))   # one mean per column
print(np.mean(array2d, axis=1))   # one mean per row

In [None]:
# MINIMUM ALONG AXIS

print(np.min(array2d, axis=0))   # smallest value in each column
print(np.min(array2d, axis=1))   # smallest value in each row

In [None]:
# MAXIMUM ALONG AXIS

print(np.max(array2d, axis=0))   # largest value in each column
print(np.max(array2d, axis=1))   # largest value in each row

### 4.2 - STATISTICAL ANALYSIS: CONFIDENCE INTERVALS

In [None]:
import scipy.stats as st                # Import scipy.stats to use pre-built statistical functions

glucose_control = diabetes_np_neg[:, 1] # Glucose column of the non-diabetic dataset

# Quantities that describe the sample, each computed once
glucose_mean = np.mean(glucose_control)     # sample mean
glucose_sem = st.sem(glucose_control)       # standard error of the mean
glucose_dof = len(glucose_control) - 1      # degrees of freedom (size of the data - 1)

# st.t.interval() returns the lower and upper bound for the given confidence level
#   confidence - confidence level
#   df         - degrees of freedom
#   loc        - the mean value of the data
#   scale      - the standard error of the data
CI_99_lower, CI_99_upper = st.t.interval(
    confidence=0.99,
    df=glucose_dof,
    loc=glucose_mean,
    scale=glucose_sem,
)

# Half-width of the confidence interval: distance from the mean to the upper bound
h = CI_99_upper - glucose_mean

In [None]:
fig = plt.figure(figsize=(7,5))

# The bar height is computed from glucose_control instead of being typed in as a constant,
# so it always matches the sample that the confidence interval half-width h was computed from.
plt.bar(['Non-diabetic'], [np.mean(glucose_control)], # Define x and y-axis data for the bar
        width = 0.5, color = 'black',                # Define visual property of the main bar
        yerr = [h], ecolor = 'grey',                 # Error bar extends h above and below the bar top
        error_kw=dict(lw=5, capsize=50, capthick=5)) # Define visual properties of the confidence interval
plt.xlim(-1, 1)                                      # Leave empty space on both sides of the single bar
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.ylabel('Glucose', fontsize = 20)