# Import PANDAS Library

Notice that we are using pd as an abbreviation so each time we want to use the PANDAS library, we can simply type pd instead of pandas.

In [1]:
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format

# Import Dataset (either full dataset or simplified)

You can use either set of datafiles (full or simplified). In the past some students' computers could not handle the full dataset. If you worry that your computer will not handle the full dataset (or if you find your code is running very slow), use the simplified version. 

The simplified version was created by taking a sample of the full data; hence the simplified files have "Sample" added to the end of the file names. Some files were not large to begin with and didn't need to be reduced. Those files do not have Sample added to the end of the file name.

## Adjust the file address below to match your computer.

You will want to replace the begining of the address below (i.e., everything before OneDrive - Oregon State University/)

In [2]:
begInv = pd.read_csv("BegInvFINAL12-31-16Sample.csv")

In [3]:
begInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,onHand,Price,startDate
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,8,12.99,1/1/2016
1,1_HARDERSFIELD_60,1,HARDERSFIELD,60,Canadian Club 1858 VAP,750mL,7,10.99,1/1/2016


In [4]:
begInv = begInv.rename(columns = {'onHand':'Beginning Quantity'})
begInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,Beginning Quantity,Price,startDate
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,8,12.99,1/1/2016
1,1_HARDERSFIELD_60,1,HARDERSFIELD,60,Canadian Club 1858 VAP,750mL,7,10.99,1/1/2016


In [5]:
begInv['InvCost01_01_2016'] = begInv['Beginning Quantity'] * begInv['Price']
begInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,Beginning Quantity,Price,startDate,InvCost01_01_2016
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,8,12.99,1/1/2016,103.92
1,1_HARDERSFIELD_60,1,HARDERSFIELD,60,Canadian Club 1858 VAP,750mL,7,10.99,1/1/2016,76.93


In [6]:
endInv = pd.read_csv("EndInvFINAL12-31-16Sample.csv")
endInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,onHand,Price,endDate
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,11,12.99,12/31/2016
1,1_HARDERSFIELD_62,1,HARDERSFIELD,62,Herradura Silver Tequila,750mL,7,36.99,12/31/2016


In [7]:
endInv = endInv.rename(columns = {'onHand':'Ending Quantity'})
endInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,Ending Quantity,Price,endDate
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,11,12.99,12/31/2016
1,1_HARDERSFIELD_62,1,HARDERSFIELD,62,Herradura Silver Tequila,750mL,7,36.99,12/31/2016


In [8]:
endInv['InvCost01_01_2016'] = endInv['Ending Quantity'] * endInv['Price']
endInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,Ending Quantity,Price,endDate,InvCost01_01_2016
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,11,12.99,12/31/2016,142.89
1,1_HARDERSFIELD_62,1,HARDERSFIELD,62,Herradura Silver Tequila,750mL,7,36.99,12/31/2016,258.93


# Deliverable 1

In [32]:
begInv_Store = begInv[['Store','Beginning Quantity','InvCost01_01_2016']].groupby('Store').sum()

In [33]:
begInv_Store.sort_values(by='InvCost01_01_2016').tail(10).sort_values(by='InvCost01_01_2016', ascending = False)

Unnamed: 0_level_0,Beginning Quantity,InvCost01_01_2016
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
34,153852,3291170.24
73,162551,3142497.36
67,158996,3079578.63
66,149314,2973033.9
76,140208,2952418.44
69,144255,2946726.65
38,114368,2232698.77
55,119641,2001263.66
50,94720,1649808.22
79,95330,1503149.48


In [34]:
begInv_Brand = begInv[['Brand','Beginning Quantity','InvCost01_01_2016']].groupby('Brand').sum()
begInv_Brand.sort_values(by='InvCost01_01_2016').tail(10).sort_values(by='InvCost01_01_2016', ascending = False)

Unnamed: 0_level_0,Beginning Quantity,InvCost01_01_2016
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1
3545,14499,463823.01
1233,12016,432455.84
8068,15341,383371.59
4261,15499,340823.01
3858,13649,327439.51
2753,4625,286703.75
8082,9287,278517.13
8680,7066,275503.34
2589,6766,270572.34
3876,14829,266773.71


# Deliverable 2

In [12]:
endInv_Store = endInv[['Store','Ending Quantity','InvCost01_01_2016']].groupby('Store').sum()
endInv_Store.head(2)
endInv_Store.sort_values(by='InvCost01_01_2016',ascending=False).head(10)
endInv_Store.sort_values(by='InvCost01_01_2016').tail(10).sort_values(by='InvCost01_01_2016', ascending = False)

Unnamed: 0_level_0,Ending Quantity,InvCost01_01_2016
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
50,260717,4887260.68
73,164589,3254662.81
67,163765,3076114.82
34,145829,3074616.75
76,143866,2975945.18
69,150848,2968678.82
66,144579,2860504.99
74,166015,2803645.13
38,129397,2463906.85
55,125584,2234836.35


In [35]:
endInv_Brand = endInv[['Brand','Ending Quantity','InvCost01_01_2016']].groupby('Brand').sum()
endInv_Brand.sort_values(by='InvCost01_01_2016',ascending=False).head(10)

Unnamed: 0_level_0,Ending Quantity,InvCost01_01_2016
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1
1233,15047,526494.53
3545,16770,502932.3
2753,7849,470861.51
8068,15608,366631.92
3405,12268,355649.32
4261,16769,351981.31
2757,11603,336370.97
2589,7922,300956.78
1376,13180,276648.2
2585,10487,272557.13


# Deliverable 3

In [13]:
merged1 = pd.merge(left=begInv, right=endInv, how='left', on=['InventoryId'])
merged1["Beginning Total"] = (merged1["Beginning Quantity"] * merged1["Price_x"])
merged1["Ending Total"] = (merged1["Ending Quantity"] * merged1["Price_y"])
merged1 = merged1[['InventoryId','Beginning Quantity', 'Beginning Total', 'Ending Quantity', 'Ending Total']]
merged1 = merged1.fillna(0)
merged1.head()

Unnamed: 0,InventoryId,Beginning Quantity,Beginning Total,Ending Quantity,Ending Total
0,1_HARDERSFIELD_58,8,103.92,11.0,142.89
1,1_HARDERSFIELD_60,7,76.93,0.0,0.0
2,1_HARDERSFIELD_62,6,221.94,7.0,258.93
3,1_HARDERSFIELD_63,3,116.97,7.0,272.93
4,1_HARDERSFIELD_72,6,209.94,4.0,139.96


In [14]:
len(merged1)

206529

# Deliverable 4a

In [15]:
purchases = pd.read_csv("PurchasesFINAL 12-31-16Sample.csv")
invoice_purchases = pd.read_csv("InvoicePurchases12-31-16Sample.csv")
purchases.head()

Unnamed: 0,InventoryId,Store,Brand,Description,Size,VendorNumber,VendorName,PONumber,PODate,ReceivingDate,InvoiceDate,PayDate,PurchasePrice,Quantity,Dollars,Classification
0,69_MOUNTMEND_8412,69,8412,Tequila Ocho Plata Fresno,750mL,105,ALTAMAR BRANDS LLC,8124,12/21/2015,1/2/2016,1/4/2016,2/16/2016,35.71,6,214.26,1
1,34_PITMERDEN_5215,34,5215,TGI Fridays Long Island Iced,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,12/22/2015,1/2/2016,1/7/2016,2/21/2016,9.41,5,47.05,1
2,76_DONCASTER_2034,76,2034,Glendalough Double Barrel,750mL,388,ATLANTIC IMPORTING COMPANY,8169,12/24/2015,1/2/2016,1/9/2016,2/16/2016,21.32,5,106.6,1
3,5_SUTTON_3348,5,3348,Bombay Sapphire Gin,1.75L,480,BACARDI USA INC,8106,12/20/2015,1/2/2016,1/12/2016,2/5/2016,22.38,6,134.28,1
4,30_CULCHETH_4903,30,4903,Bacardi Superior Rum,200mL,480,BACARDI USA INC,8106,12/20/2015,1/1/2016,1/12/2016,2/5/2016,2.87,48,137.76,1


In [16]:
purchases['InvoiceDate'] = pd.to_datetime(purchases['InvoiceDate'])
vendor_purchases = purchases[purchases['InvoiceDate'].dt.year == 2016]
vendor_purchases = purchases.groupby(['VendorNumber', 'VendorName']).agg({'Dollars': 'sum'}).reset_index()
vendor_purchases_grouped = purchases.sort_values(by='Dollars',ascending=False).head(10)
vendor_purchases_grouped= vendor_purchases_grouped[['VendorNumber','VendorName', 'Dollars']]
vendor_purchases_grouped.head(10)

Unnamed: 0,VendorNumber,VendorName,Dollars
503213,480,BACARDI USA INC,36517.35
827869,480,BACARDI USA INC,32465.68
820905,8112,MOET HENNESSY USA INC,32445.54
741650,12546,JIM BEAM BRANDS COMPANY,30534.79
637575,3960,DIAGEO NORTH AMERICA INC,26727.69
503811,480,BACARDI USA INC,25961.97
127802,17035,PERNOD RICARD USA,23989.62
518029,17035,PERNOD RICARD USA,22599.36
810476,1128,BROWN-FORMAN CORP,20727.03
967833,17035,PERNOD RICARD USA,19901.72


# Deliverable 4b

In [17]:
vendor_invoice_purchases = invoice_purchases[['VendorNumber','VendorName','Freight']].groupby('VendorNumber').sum()
vendor_invoice_grouped = vendor_invoice_purchases.sort_values(by='Freight',ascending=False).head(10)
vendor_invoice_grouped.head(10)

Unnamed: 0_level_0,VendorName,Freight
VendorNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
3960,DIAGEO NORTH AMERICA INC DIAGEO NORTH AMERIC...,257032.07
4425,MARTIGNETTI COMPANIES MARTIGNETTI COMPANIESMAR...,144929.24
12546,JIM BEAM BRANDS COMPANY JIM BEAM BRANDS COM...,123880.97
17035,PERNOD RICARD USA PERNOD RICARD USA ...,123780.22
480,BACARDI USA INC BACARDI USA INC ...,89286.27
1392,CONSTELLATION BRANDS INC CONSTELLATION BRAND...,79528.99
1128,BROWN-FORMAN CORP BROWN-FORMAN CORP ...,68601.68
9165,ULTRA BEVERAGE COMPANY LLP ULTRA BEVERAGE COMP...,68054.7
3252,E & J GALLO WINERY E & J GALLO WINERY ...,61966.91
9552,M S WALKER INC M S WALKER INC ...,55551.82


# Deliverable 4c

In [30]:
vendor_purchases1 =invoice_purchases[invoice_purchases["Dollars"] >= 250000]
vendor_purchases1['InvoiceDate'] = pd.to_datetime(vendor_purchases1['InvoiceDate'])
vendor_purchases = purchases[purchases['InvoiceDate'].dt.year == 2016]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vendor_purchases1['InvoiceDate'] = pd.to_datetime(vendor_purchases1['InvoiceDate'])


In [31]:
vendor_purchases1["Freight Rate"] = (vendor_purchases1["Freight"]/vendor_purchases1["Dollars"])
vendor_purchases1["FreightPerUnit"] = (vendor_purchases1["Freight"]/vendor_purchases1["Quantity"])
vendor_purchases1_grouped = vendor_purchases1[['VendorNumber','VendorName','Freight','Dollars','Quantity','Freight Rate','FreightPerUnit']].groupby('VendorNumber').sum()
vendor_purchases_sorted = vendor_purchases1_grouped.sort_values(by='Freight Rate',ascending=False).head(10)
vendor_purchases_sorted.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vendor_purchases1["Freight Rate"] = (vendor_purchases1["Freight"]/vendor_purchases1["Dollars"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vendor_purchases1["FreightPerUnit"] = (vendor_purchases1["Freight"]/vendor_purchases1["Quantity"])


Unnamed: 0_level_0,VendorName,Freight,Dollars,Quantity,Freight Rate,FreightPerUnit
VendorNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4425,MARTIGNETTI COMPANIESMARTIGNETTI COMPANIESMART...,139012.99,27432418.49,2600730,0.26,2.77
12546,JIM BEAM BRANDS COMPANY JIM BEAM BRANDS COM...,119877.31,23956423.83,2715232,0.26,2.31
3960,DIAGEO NORTH AMERICA INC DIAGEO NORTH AMERIC...,249954.76,50556954.94,5422558,0.26,2.36
17035,PERNOD RICARD USA PERNOD RICARD USA ...,117037.87,23267207.39,1582397,0.25,3.6
1392,CONSTELLATION BRANDS INC CONSTELLATION BRAND...,62069.8,12326756.42,1825237,0.19,1.26
480,BACARDI USA INC BACARDI USA INC ...,72547.44,14728714.8,1165943,0.18,2.29
1128,BROWN-FORMAN CORP BROWN-FORMAN CORP ...,43517.45,8755247.09,627082,0.14,1.95
9165,ULTRA BEVERAGE COMPANY LLP ULTRA BEVERAGE COMP...,33567.44,6527827.47,522282,0.11,1.35
3252,E & J GALLO WINERY E & J GALLO WINERY ...,26985.48,5474544.72,832842,0.09,0.62
8112,MOET HENNESSY USA INC MOET HENNESSY USA I...,16609.8,3356086.94,140015,0.05,1.19


In [20]:
vendor_purchases_sorted.sort_values(by='Freight Rate').tail(5).sort_values(by='Freight Rate', ascending = True)

Unnamed: 0_level_0,VendorName,Freight,Dollars,Quantity,Freight Rate,FreightPerUnit
VendorNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1392,CONSTELLATION BRANDS INC CONSTELLATION BRAND...,62069.8,12326756.42,1825237,0.19,1.26
17035,PERNOD RICARD USA PERNOD RICARD USA ...,117037.87,23267207.39,1582397,0.25,3.6
3960,DIAGEO NORTH AMERICA INC DIAGEO NORTH AMERIC...,249954.76,50556954.94,5422558,0.26,2.36
12546,JIM BEAM BRANDS COMPANY JIM BEAM BRANDS COM...,119877.31,23956423.83,2715232,0.26,2.31
4425,MARTIGNETTI COMPANIESMARTIGNETTI COMPANIESMART...,139012.99,27432418.49,2600730,0.26,2.77


# Deliverable 4d

In [21]:
vendor_purchases2 = invoice_purchases.drop(invoice_purchases[invoice_purchases['VendorName'] == 'EDRINGTON AMERICAS'].index)
vendor_purchases2 = vendor_purchases2[vendor_purchases2["Freight"]>=100]
vendor_purchases2 = vendor_purchases2[vendor_purchases2["Quantity"]<=1000]
vendor_purchases2 = vendor_purchases2.groupby(['VendorNumber','VendorName']).agg({'Freight':'sum','Dollars':'sum','Quantity':'sum'}).sort_values(by='Freight',ascending=False)
vendor_purchases2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Freight,Dollars,Quantity
VendorNumber,VendorName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2561,EDRINGTON AMERICAS,2944.67,570627.52,18120
8664,"STOLI GROUP,(USA) LLC",349.02,14299.98,975
7239,REMY COINTREAU USA INC,348.07,7515.38,382
653,STATE WINE & SPIRITS,303.7,5544.89,541
6785,PALM BAY INTERNATIONAL INC,242.73,5533.18,715


# Deliverable 4e

In [22]:
vendor_purchases2 = invoice_purchases.drop(invoice_purchases[invoice_purchases['VendorName'] == 'EDRINGTON AMERICAS'].index)
vendor_purchases2["Freight Rate"] = (vendor_purchases2["Freight"]/vendor_purchases2["Dollars"])
vendor_purchases2["FreightPerUnit"] = (vendor_purchases2["Freight"]/vendor_purchases2["Quantity"])
vendor_purchases2 = vendor_purchases2[vendor_purchases2["Freight"]>=100]
vendor_purchases2 = vendor_purchases2[vendor_purchases2["Quantity"]<=1000]
vendor_purchases2 = vendor_purchases2.groupby(['VendorNumber','VendorName']).agg({'Freight':'sum','Dollars':'sum','Quantity':'sum','Freight Rate':'sum','FreightPerUnit':'sum'}).sort_values(by='Freight',ascending=False)
vendor_purchases2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Freight,Dollars,Quantity,Freight Rate,FreightPerUnit
VendorNumber,VendorName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2561,EDRINGTON AMERICAS,2944.67,570627.52,18120,0.14,4.82
8664,"STOLI GROUP,(USA) LLC",349.02,14299.98,975,0.02,0.36
7239,REMY COINTREAU USA INC,348.07,7515.38,382,0.05,0.91
653,STATE WINE & SPIRITS,303.7,5544.89,541,0.05,0.56
6785,PALM BAY INTERNATIONAL INC,242.73,5533.18,715,0.04,0.34


The rate and freight per unit is alot less in 4e than in 4c.

# Deliverable 5

In [29]:
sales = pd.read_csv("SalesFINAL 12-31-16Sample.csv")
sales["Revenue"] = sales["SalesPrice"] * sales["SalesQuantity"]
sales_grouped = sales[["VendorNo",'VendorName',"Revenue",]].groupby(["VendorNo",'VendorName']).sum()
sales_sorted = sales_grouped.sort_values(by="Revenue", ascending=False)
sales_sorted.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue
VendorNo,VendorName,Unnamed: 2_level_1
3960,DIAGEO NORTH AMERICA INC,5333892.56
4425,MARTIGNETTI COMPANIES,3169065.81
12546,JIM BEAM BRANDS COMPANY,2487675.19
17035,PERNOD RICARD USA,2449328.8
480,BACARDI USA INC,1935245.08
1392,CONSTELLATION BRANDS INC,1914760.47
3252,E & J GALLO WINERY,1446875.44
1128,BROWN-FORMAN CORP,1437455.0
9165,ULTRA BEVERAGE COMPANY LLP,1405457.91
9552,M S WALKER INC,1205760.39


I decided to do an analysis of the top ten vendors who made the most in revenue. Diageo North America Inc earned $5,333,892.56 in revenue from 2016. 