In [1]:
# Import necessary Libraries
import numpy as np
import pandas as pd
import sys  
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = 10, 8
from pprint import pprint

In [2]:
# Load Data
foreign_mat = pd.read_csv('C:/Users/Krista/Documents/CMU/Spring 2017/Systems/Data/Foreign_Matter.csv')
micro = pd.read_csv('C:/Users/Krista/Documents/CMU/Spring 2017/Systems/Data/Micro_Screen.csv')
moisture = pd.read_csv('C:/Users/Krista/Documents/CMU/Spring 2017/Systems/Data/Moisture_Content.csv')
solvent = pd.read_csv('C:/Users/Krista/Documents/CMU/Spring 2017/Systems/Data/Solvent.csv')
potency = pd.read_csv('C:/Users/Krista/Documents/CMU/Spring 2017/Systems/Data/potencytests.csv') 
samples = pd.read_csv('C:/Users/Krista/Documents/CMU/Spring 2017/Systems/Data/Sample_lab_time.csv') 

In [3]:
# Create Test Type Column
foreign_mat['Test'] = 'Foreign Matter'
micro['Test'] = 'Micro Screen'
moisture['Test'] = 'Moisture Content'
solvent['Test'] = 'Solvent Screen'

# Combine Foreign Matter, Moisture Content and Solvent Screening
tests = pd.concat([foreign_mat, micro, moisture, solvent])
tests.sort_values('sample_id', ascending=True, inplace=True )
tests.head()

Unnamed: 0,sample_id,name,value,failure,location,orgid,Test
374,19,yeast_and_mold,8800.0,0,139,170,Micro Screen
373,19,bile_tolerant,0.0,0,139,170,Micro Screen
372,19,aerobic_bacteria,0.0,0,139,170,Micro Screen
74,19,moisture,6.0,0,139,170,Moisture Content
148,19,Stems,0.0,0,139,170,Foreign Matter


In [4]:
# Subset Potency data to lab and location level only
labs = pd.DataFrame(samples, columns=['sample_id', 'test_date', 'lab_license'])
labs.sort_values('sample_id', ascending=True, inplace=True )
labs.head()

Unnamed: 0,sample_id,test_date,lab_license
0,18,6/19/2014,4
1,19,6/19/2014,4
2,20,6/19/2014,4
3,21,6/19/2014,4
4,22,6/19/2014,4


In [5]:
merge = pd.merge(tests, labs, on='sample_id', how='left')
merge.head()

Unnamed: 0,sample_id,name,value,failure,location,orgid,Test,test_date,lab_license
0,19,yeast_and_mold,8800.0,0,139,170,Micro Screen,6/19/2014,4
1,19,bile_tolerant,0.0,0,139,170,Micro Screen,6/19/2014,4
2,19,aerobic_bacteria,0.0,0,139,170,Micro Screen,6/19/2014,4
3,19,moisture,6.0,0,139,170,Moisture Content,6/19/2014,4
4,19,Stems,0.0,0,139,170,Foreign Matter,6/19/2014,4


In [8]:
fails = merge.groupby(['location', 'Test'])['failure'].sum().reset_index()
#Lab_group = spanish.sort_values('spanish', ascending=False)
fails.head()

Unnamed: 0,location,Test,failure
0,124,Foreign Matter,0
1,124,Micro Screen,31
2,124,Moisture Content,1
3,126,Foreign Matter,0
4,126,Micro Screen,4


In [9]:
test_tot = merge.groupby(['location', 'Test'])['failure'].count().reset_index()
#Lab_group = spanish.sort_values('spanish', ascending=False)
test_tot.head()

Unnamed: 0,location,Test,failure
0,124,Foreign Matter,260
1,124,Micro Screen,665
2,124,Moisture Content,130
3,126,Foreign Matter,474
4,126,Micro Screen,1545


In [12]:
fail_merge = pd.merge(fails, test_tot, on=['location','Test'], how='left')
fail_merge = fail_merge.rename(columns = {
    'failure_x':'Num Fail', 'failure_y':'Total Tests'})
fail_merge['Fail Rate'] = fail_merge['Num Fail']/fail_merge['Total Tests']
fail_merge.head()

Unnamed: 0,location,Test,Num Fail,Total Tests,Fail Rate
0,124,Foreign Matter,0,260,0.0
1,124,Micro Screen,31,665,0.046617
2,124,Moisture Content,1,130,0.007692
3,126,Foreign Matter,0,474,0.0
4,126,Micro Screen,4,1545,0.002589


In [13]:
Labs_pivot = merge.pivot_table(values=['failure'], index=['lab_license', 'Test'], aggfunc=np.sum)
Labs_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,failure
lab_license,Test,Unnamed: 2_level_1
1,Foreign Matter,0
1,Micro Screen,148
1,Moisture Content,4
1,Solvent Screen,0
2,Foreign Matter,33
2,Micro Screen,344
2,Moisture Content,9
2,Solvent Screen,3
3,Foreign Matter,8
3,Micro Screen,3448


In [17]:
Labs_pivot_count = merge.pivot_table(values=['failure'], index=['lab_license', 'Test'], aggfunc=[np.sum, len])
Labs_pivot_count

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,len
Unnamed: 0_level_1,Unnamed: 1_level_1,failure,failure
lab_license,Test,Unnamed: 2_level_2,Unnamed: 3_level_2
1,Foreign Matter,0,764
1,Micro Screen,148,3810
1,Moisture Content,4,378
1,Solvent Screen,0,25
2,Foreign Matter,33,1460
2,Micro Screen,344,9877
2,Moisture Content,9,724
2,Solvent Screen,3,1250
3,Foreign Matter,8,66484
3,Micro Screen,3448,189285


In [7]:
fails = tests.groupby(['Test'])['failure'].sum().reset_index()
#Lab_group = spanish.sort_values('spanish', ascending=False)
fails.head(25)

Unnamed: 0,Test,failure
0,Foreign Matter,121
1,Micro Screen,14400
2,Moisture Content,441
3,Solvent Screen,561


In [8]:
tests['failure'].value_counts()

0    921465
1     15523
Name: failure, dtype: int64

In [13]:
# Create a set of dummy variables from the sex variable
df_all = pd.get_dummies(tests['Test'])
# Join the dummy variables to the main dataframe
tests_new = pd.concat([tests, df_all], axis=1)
tests_new.head(20)

Unnamed: 0,sample_id,name,value,failure,location,orgid,Test,Foreign Matter,Micro Screen,Moisture Content,Solvent Screen
374,19,yeast_and_mold,8800.0,0,139,170,Micro Screen,0.0,1.0,0.0,0.0
373,19,bile_tolerant,0.0,0,139,170,Micro Screen,0.0,1.0,0.0,0.0
372,19,aerobic_bacteria,0.0,0,139,170,Micro Screen,0.0,1.0,0.0,0.0
74,19,moisture,6.0,0,139,170,Moisture Content,0.0,0.0,1.0,0.0
148,19,Stems,0.0,0,139,170,Foreign Matter,1.0,0.0,0.0,0.0
371,19,coliforms,0.0,0,139,170,Micro Screen,0.0,1.0,0.0,0.0
370,19,e_coli_and_salmonella,0.0,0,139,170,Micro Screen,0.0,1.0,0.0,0.0
149,19,Other,0.0,0,139,170,Foreign Matter,1.0,0.0,0.0,0.0
377,20,aerobic_bacteria,200.0,0,139,170,Micro Screen,0.0,1.0,0.0,0.0
379,20,coliforms,0.0,0,139,170,Micro Screen,0.0,1.0,0.0,0.0


In [14]:
total_tests = pd.DataFrame(merge, columns=['location', 'Foreign Matter', 'Micro Screen','Moisture Content', 'Solvent Screen'])
total_tests.head()

Unnamed: 0,location,Foreign Matter,Micro Screen,Moisture Content,Solvent Screen
0,139,,,,
1,139,,,,
2,139,,,,
3,139,,,,
4,139,,,,


In [38]:
Lab_group = total_tests.groupby(['lab_license'])['Foreign Matter', 'Micro Screen', 'Moisture Content', 'Solvent Screen'].sum().reset_index()
#Lab_group = spanish.sort_values('spanish', ascending=False)
Lab_group.head(25)

Unnamed: 0,lab_license,Foreign Matter,Micro Screen,Moisture Content,Solvent Screen
0,1,764.0,3810.0,378.0,25.0
1,2,1460.0,9877.0,724.0,1250.0
2,3,66484.0,189285.0,32948.0,3142.0
3,4,18298.0,62251.0,9083.0,2538.0
4,5,1712.0,4815.0,851.0,92.0
5,6,5246.0,17032.0,2620.0,64.0
6,7,24342.0,68570.0,12071.0,862.0
7,8,5938.0,18365.0,2949.0,113.0
8,9,20944.0,72140.0,10443.0,2871.0
9,10,5286.0,14605.0,2628.0,298.0


In [41]:
fails2 = pd.DataFrame(merge, columns=['lab_license', 'Foreign Matter', 'Micro Screen','Moisture Content', 'Solvent Screen', 'failure'])
fails2.head()

Unnamed: 0,lab_license,Foreign Matter,Micro Screen,Moisture Content,Solvent Screen,failure
0,4,0.0,1.0,0.0,0.0,0
1,4,0.0,1.0,0.0,0.0,0
2,4,0.0,1.0,0.0,0.0,0
3,4,0.0,0.0,1.0,0.0,0
4,4,1.0,0.0,0.0,0.0,0


In [15]:
Labs_pivot2 = merge.pivot_table(values=['failure'], index=['location'], columns=['Test'], aggfunc=[np.sum,len])
Labs_pivot2

Unnamed: 0_level_0,sum,sum,sum,sum,len,len,len,len
Unnamed: 0_level_1,failure,failure,failure,failure,failure,failure,failure,failure
Test,Foreign Matter,Micro Screen,Moisture Content,Solvent Screen,Foreign Matter,Micro Screen,Moisture Content,Solvent Screen
location,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
124,0.0,31.0,1.0,,260.0,665.0,130.0,
126,0.0,4.0,1.0,4.0,474.0,1545.0,237.0,74.0
127,0.0,35.0,0.0,,504.0,1260.0,246.0,
139,0.0,19.0,0.0,0.0,330.0,825.0,165.0,1.0
140,0.0,21.0,0.0,,298.0,915.0,148.0,
141,0.0,51.0,0.0,,436.0,1090.0,210.0,
142,0.0,17.0,0.0,,144.0,350.0,70.0,
143,0.0,1.0,0.0,,748.0,1955.0,374.0,
144,0.0,16.0,0.0,,484.0,1220.0,242.0,
145,0.0,19.0,0.0,,276.0,690.0,138.0,


In [17]:
Labs_pivot2.to_csv('C:/Users/Krista/Documents/CMU/Spring 2017/Systems/Data/Producer_fail_rate.csv', index=True)

In [8]:
# Subset tests data for merging
tests = pd.DataFrame(tests, columns=['sample_id', 'failure','Test'])
tests.sort_values('sample_id', ascending=True, inplace=True )
tests.head()

Unnamed: 0,sample_id,failure,Test
374,19,0,Micro Screen
373,19,0,Micro Screen
372,19,0,Micro Screen
74,19,0,Moisture Content
148,19,0,Foreign Matter


In [35]:
Total_tests = pd.DataFrame(merge, columns=['lab_license', 'Test'])
# Create a set of dummy variables from the sex variable
tests_all = pd.get_dummies(tests['Test'])
# Join the dummy variables to the main dataframe
tests_new = pd.concat([tests, df_all], axis=1)
tests_new.head()
Total_tests.head()

Unnamed: 0,lab_license,Test
0,4,Micro Screen
1,4,Micro Screen
2,4,Micro Screen
3,4,Moisture Content
4,4,Foreign Matter


In [6]:
#Define a generic function using Pandas replace function
def coding(col, codeDict):
  colCoded = pd.Series(col, copy=True)
  for key, value in codeDict.items():
    colCoded.replace(key, value, inplace=True)
  return colCoded

tests["failure"] = coding(tests["failure"], {0:'Pass',1:'Fail'})


tests_new['Foreign Matter'] = coding(tests_new['Foreign Matter'], {0.0:'Pass',1.0:'Fail'})
tests_new['Micro Screen'] = coding(tests_new['Micro Screen'], {0.0:'Pass',1.0:'Fail'})
tests_new['Moisture Content'] = coding(tests_new['Moisture Content'], {0.0:'Pass',1.0:'Fail'})
tests_new['Solvent Screen'] = coding(tests_new['Solvent Screen'], {0.0:'Pass',1.0:'Fail'})
tests_new.head(3)
tests.head()

Unnamed: 0,sample_id,name,value,failure,location,orgid,Test
374,19,yeast_and_mold,8800.0,Pass,139,170,Micro Screen
373,19,bile_tolerant,0.0,Pass,139,170,Micro Screen
372,19,aerobic_bacteria,0.0,Pass,139,170,Micro Screen
74,19,moisture,6.0,Pass,139,170,Moisture Content
148,19,Stems,0.0,Pass,139,170,Foreign Matter


In [14]:
# Subset tests data for merging
tests_samples = pd.DataFrame(tests, columns=['orgid', 'failure', 'location','Test'])
tests_samples.sort_values('orgid', ascending=True, inplace=True )
tests_samples.head()

Unnamed: 0,orgid,failure,location,Test
39976,163,0,142,Moisture Content
5034,163,0,142,Foreign Matter
5035,163,0,142,Foreign Matter
36588,163,0,142,Moisture Content
36587,163,0,142,Moisture Content


In [13]:
# Subset Potency data to lab and location level only
labs = pd.DataFrame(samples, columns=['sample_id', 'test_date', 'lab_license'])
labs.sort_values('sample_id', ascending=True, inplace=True )
labs.head()

Unnamed: 0,sample_id,test_date,lab_license
0,18,6/19/2014,4
1,19,6/19/2014,4
2,20,6/19/2014,4
3,21,6/19/2014,4
4,22,6/19/2014,4


In [19]:
merge = pd.merge(tests, labs, on='sample_id', how='left')
merge.head(7)

Unnamed: 0,sample_id,name,value,failure,location,orgid,Test,test_date,lab_license
0,19,yeast_and_mold,8800.0,0,139,170,Micro Screen,6/19/2014,4
1,19,bile_tolerant,0.0,0,139,170,Micro Screen,6/19/2014,4
2,19,aerobic_bacteria,0.0,0,139,170,Micro Screen,6/19/2014,4
3,19,moisture,6.0,0,139,170,Moisture Content,6/19/2014,4
4,19,Stems,0.0,0,139,170,Foreign Matter,6/19/2014,4
5,19,coliforms,0.0,0,139,170,Micro Screen,6/19/2014,4
6,19,e_coli_and_salmonella,0.0,0,139,170,Micro Screen,6/19/2014,4


In [20]:
# impute_grps = data.pivot_table(values=["LoanAmount"], index=["Gender","Married","Self_Employed"], aggfunc=np.mean)

Labs_pivot = merge.pivot_table(values=['failure'], index=['lab_license', 'Test'], aggfunc=np.sum)
Labs_pivot


Unnamed: 0_level_0,Unnamed: 1_level_0,failure
lab_license,Test,Unnamed: 2_level_1
1,Foreign Matter,0
1,Micro Screen,148
1,Moisture Content,4
1,Solvent Screen,0
2,Foreign Matter,33
2,Micro Screen,344
2,Moisture Content,9
2,Solvent Screen,3
3,Foreign Matter,8
3,Micro Screen,3448


In [18]:
print(len(merge))

936988


In [7]:
samples.head()

Unnamed: 0,id,inventoryid,quantity,sessiontime,inventorytype,strain,product_name,deleted,result,lab_license,...,transactionid,parentid,received,received_quantity,transactionid_original,sample_amount_used,sample_amount_destroyed,sample_amount_other,other_sample_id,inventoryparentid
0,18,397254000000000.0,7.0,1403191624,13,Double Purple Doja,,1,0,4,...,34955,6032850000000000.0,0,,34955,,,,,6032850000000000.0
1,19,5721620000000000.0,7.0,1403191980,13,Dutch Hawaiian,,0,1,4,...,34977,6032850000000000.0,1,7.0,34977,7.0,,,,6032850000000000.0
2,20,684306000000000.0,7.0,1403192191,13,Shiska Berry,,0,1,4,...,34990,6032850000000000.0,1,7.0,34990,7.0,,,,6032850000000000.0
3,21,8933330000000000.0,7.0,1403192572,13,Sleestack,,0,1,4,...,35012,6032850000000000.0,1,7.0,35012,7.0,,,,6032850000000000.0
4,22,2103310000000000.0,7.0,1403192711,13,Space Needle,,0,1,4,...,35019,6032850000000000.0,1,7.0,35019,7.0,,,,6032850000000000.0


In [11]:
# Subset Potency data to lab and location level only
labs_samples = pd.DataFrame(samples, columns=['orgid', 'lab_license'])
labs_samples.sort_values('orgid', ascending=True, inplace=True )
labs_samples.head(20)

Unnamed: 0,orgid,lab_license
61420,163,7
71520,163,7
71519,163,7
71518,163,7
71517,163,7
71516,163,7
55399,163,7
55400,163,7
55401,163,7
55403,163,7


In [None]:
merge = pd.merge(tests_samples, labs, on='location', how='left')
merge.head(20)