# Import PANDAS Library

Notice that we are using pd as an abbreviation so each time we want to use the PANDAS library, we can simply type pd instead of pandas.

In [1]:
import pandas as pd

In [2]:
pd.options.display.float_format = '{:,.2f}'.format

# Import Dataset (either full dataset or simplified)

You can use either set of datafiles (full or simplified). In the past some students' computers could not handle the full dataset. If you worry that your computer will not handle the full dataset (or if you find your code is running very slow), use the simplified version. 

The simplified version was created by taking a sample of the full data; hence the simplified files have "Sample" added to the end of the file names. Some files were not large to begin with and didn't need to be reduced. Those files do not have Sample added to the end of the file name.

## Adjust the file address below to match your computer.

You will want to replace the begining of the address below (i.e., everything before OneDrive - Oregon State University/)

In [3]:
begInv = pd.read_csv("BegInvFINAL12-31-16Sample.csv")

In [4]:
begInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,onHand,Price,startDate
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,8,12.99,2016-01-01
1,1_HARDERSFIELD_60,1,HARDERSFIELD,60,Canadian Club 1858 VAP,750mL,7,10.99,2016-01-01


In [5]:
begInv = begInv.rename(columns = {'onHand':'01_01_2016_Qty'})
begInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,01_01_2016_Qty,Price,startDate
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,8,12.99,2016-01-01
1,1_HARDERSFIELD_60,1,HARDERSFIELD,60,Canadian Club 1858 VAP,750mL,7,10.99,2016-01-01


In [6]:
begInv['InvCost01_01_2016'] = begInv['01_01_2016_Qty'] * begInv['Price']
begInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,01_01_2016_Qty,Price,startDate,InvCost01_01_2016
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,8,12.99,2016-01-01,103.92
1,1_HARDERSFIELD_60,1,HARDERSFIELD,60,Canadian Club 1858 VAP,750mL,7,10.99,2016-01-01,76.93


In [7]:
begInv_Store = begInv[['Store','01_01_2016_Qty','InvCost01_01_2016']].groupby('Store').sum()
begInv_Store.head(2)

Unnamed: 0_level_0,01_01_2016_Qty,InvCost01_01_2016
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
1,49917,838079.59
2,52925,840452.2


# Deliverable 1A

In [8]:
begInv_Store.sort_values(by='InvCost01_01_2016',ascending=False).head(10)

Unnamed: 0_level_0,01_01_2016_Qty,InvCost01_01_2016
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
34,153852,3291170.24
73,162551,3142497.36
67,158996,3079578.63
66,149314,2973033.9
76,140208,2952418.44
69,144255,2946726.65
38,114368,2232698.77
55,119641,2001263.66
50,94720,1649808.22
79,95330,1503149.48


# Deliverable 1B

In [9]:
begInv_Brand = begInv[['Brand','01_01_2016_Qty','InvCost01_01_2016']].groupby('Brand').sum()
begInv_Brand.head(2)

Unnamed: 0_level_0,01_01_2016_Qty,InvCost01_01_2016
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1
58,281,3650.19
60,288,3165.12


# Deliverable 2A

In [10]:
endInv = pd.read_csv("EndInvFINAL12-31-16Sample.csv")

In [11]:
endInv = endInv.rename(columns = {'onHand':'End_Qty'})
endInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,End_Qty,Price,endDate
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,11,12.99,2016-12-31
1,1_HARDERSFIELD_62,1,HARDERSFIELD,62,Herradura Silver Tequila,750mL,7,36.99,2016-12-31


In [12]:
endInv['EInvCost2016'] = endInv['End_Qty'] * endInv['Price']
endInv.head(2)

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,End_Qty,Price,endDate,EInvCost2016
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,11,12.99,2016-12-31,142.89
1,1_HARDERSFIELD_62,1,HARDERSFIELD,62,Herradura Silver Tequila,750mL,7,36.99,2016-12-31,258.93


In [13]:
endInv_Store = endInv[['Store','End_Qty','EInvCost2016']].groupby('Store').sum()
endInv_Store.head(2)

Unnamed: 0_level_0,End_Qty,EInvCost2016
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
1,79827,1206845.93
2,56671,850884.06


In [14]:
endInv_Store.sort_values(by='EInvCost2016',ascending=False).head(10)

Unnamed: 0_level_0,End_Qty,EInvCost2016
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
50,260717,4887260.68
73,164589,3254662.81
67,163765,3076114.82
34,145829,3074616.75
76,143866,2975945.18
69,150848,2968678.82
66,144579,2860504.99
74,166015,2803645.13
38,129397,2463906.85
55,125584,2234836.35


# Deliverable 2B

In [15]:
endInv_Brand = endInv[['Brand','End_Qty','EInvCost2016']].groupby('Brand').sum()
endInv_Brand.head(2)

Unnamed: 0_level_0,End_Qty,EInvCost2016
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1
58,385,5001.15
60,146,1604.54


# Deliverable 3

In [16]:
begInv.head()

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,01_01_2016_Qty,Price,startDate,InvCost01_01_2016
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,8,12.99,2016-01-01,103.92
1,1_HARDERSFIELD_60,1,HARDERSFIELD,60,Canadian Club 1858 VAP,750mL,7,10.99,2016-01-01,76.93
2,1_HARDERSFIELD_62,1,HARDERSFIELD,62,Herradura Silver Tequila,750mL,6,36.99,2016-01-01,221.94
3,1_HARDERSFIELD_63,1,HARDERSFIELD,63,Herradura Reposado Tequila,750mL,3,38.99,2016-01-01,116.97
4,1_HARDERSFIELD_72,1,HARDERSFIELD,72,No. 3 London Dry Gin,750mL,6,34.99,2016-01-01,209.94


In [17]:
endInv.head()

Unnamed: 0,InventoryId,Store,City,Brand,Description,Size,End_Qty,Price,endDate,EInvCost2016
0,1_HARDERSFIELD_58,1,HARDERSFIELD,58,Gekkeikan Black & Gold Sake,750mL,11,12.99,2016-12-31,142.89
1,1_HARDERSFIELD_62,1,HARDERSFIELD,62,Herradura Silver Tequila,750mL,7,36.99,2016-12-31,258.93
2,1_HARDERSFIELD_63,1,HARDERSFIELD,63,Herradura Reposado Tequila,750mL,7,38.99,2016-12-31,272.93
3,1_HARDERSFIELD_72,1,HARDERSFIELD,72,No. 3 London Dry Gin,750mL,4,34.99,2016-12-31,139.96
4,1_HARDERSFIELD_75,1,HARDERSFIELD,75,Three Olives Tomato Vodka,750mL,7,14.99,2016-12-31,104.93


In [18]:
Inventory = pd.merge(left=begInv[["InventoryId", "01_01_2016_Qty", "InvCost01_01_2016"]], 
                     right=endInv[["InventoryId", "End_Qty", "EInvCost2016"]], 
                     how="outer", 
                     on=["InventoryId"])
Inventory.head()

Unnamed: 0,InventoryId,01_01_2016_Qty,InvCost01_01_2016,End_Qty,EInvCost2016
0,1_HARDERSFIELD_58,8.0,103.92,11.0,142.89
1,1_HARDERSFIELD_60,7.0,76.93,,
2,1_HARDERSFIELD_62,6.0,221.94,7.0,258.93
3,1_HARDERSFIELD_63,3.0,116.97,7.0,272.93
4,1_HARDERSFIELD_72,6.0,209.94,4.0,139.96


In [19]:
len(Inventory)

256042

# Deliverable 4A

In [23]:
InvoicePurchases = pd.read_csv("InvoicePurchases12-31-16Sample.csv")
InvoicePurchases.head()

Unnamed: 0,VendorNumber,VendorName,InvoiceDate,PONumber,PODate,PayDate,Quantity,Dollars,Freight,Approval
0,105,ALTAMAR BRANDS LLC,2016-01-04,8124,2015-12-21,2016-02-16,6,214.26,3.47,
1,4466,AMERICAN VINTAGE BEVERAGE,2016-01-07,8137,2015-12-22,2016-02-21,15,140.55,8.57,
2,388,ATLANTIC IMPORTING COMPANY,2016-01-09,8169,2015-12-24,2016-02-16,5,106.6,4.61,
3,480,BACARDI USA INC,2016-01-12,8106,2015-12-20,2016-02-05,10100,137483.78,2935.2,
4,516,BANFI PRODUCTS CORP,2016-01-07,8170,2015-12-24,2016-02-12,1935,15527.25,429.2,


In [27]:
Vendors = InvoicePurchases[['VendorNumber', 'VendorName', 'Dollars']].groupby('VendorNumber').sum()
Vendors.sort_values(by = 'Dollars', ascending = False).head(10)
    


Unnamed: 0_level_0,VendorName,Dollars
VendorNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
3960,DIAGEO NORTH AMERICA INC DIAGEO NORTH AMERIC...,50959796.85
4425,MARTIGNETTI COMPANIES MARTIGNETTI COMPANIESMAR...,27861690.02
12546,JIM BEAM BRANDS COMPANY JIM BEAM BRANDS COM...,24203151.05
17035,PERNOD RICARD USA PERNOD RICARD USA ...,24124091.56
480,BACARDI USA INC BACARDI USA INC ...,17624378.72
1392,CONSTELLATION BRANDS INC CONSTELLATION BRAND...,15573917.9
1128,BROWN-FORMAN CORP BROWN-FORMAN CORP ...,13529433.08
9165,ULTRA BEVERAGE COMPANY LLP ULTRA BEVERAGE COMP...,13210613.93
3252,E & J GALLO WINERY E & J GALLO WINERY ...,12289608.09
9552,M S WALKER INC M S WALKER INC ...,10935817.3


# Deliverable 4B

In [28]:
Vendors_Freight = InvoicePurchases[['VendorNumber', 'VendorName', 'Freight']].groupby('VendorNumber').sum()
Vendors_Freight.sort_values(by = 'Freight', ascending = False).head(10)

Unnamed: 0_level_0,VendorName,Freight
VendorNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
3960,DIAGEO NORTH AMERICA INC DIAGEO NORTH AMERIC...,257032.07
4425,MARTIGNETTI COMPANIES MARTIGNETTI COMPANIESMAR...,144929.24
12546,JIM BEAM BRANDS COMPANY JIM BEAM BRANDS COM...,123880.97
17035,PERNOD RICARD USA PERNOD RICARD USA ...,123780.22
480,BACARDI USA INC BACARDI USA INC ...,89286.27
1392,CONSTELLATION BRANDS INC CONSTELLATION BRAND...,79528.99
1128,BROWN-FORMAN CORP BROWN-FORMAN CORP ...,68601.68
9165,ULTRA BEVERAGE COMPANY LLP ULTRA BEVERAGE COMP...,68054.7
3252,E & J GALLO WINERY E & J GALLO WINERY ...,61966.91
9552,M S WALKER INC M S WALKER INC ...,55551.82


# Deliverable 4C

In [40]:
InvoicePurchases.head()

Unnamed: 0,VendorNumber,VendorName,InvoiceDate,PONumber,PODate,PayDate,Quantity,Dollars,Freight,Approval,freight_per_$
0,105,ALTAMAR BRANDS LLC,2016-01-04,8124,2015-12-21,2016-02-16,6,214.26,3.47,,0.02
1,4466,AMERICAN VINTAGE BEVERAGE,2016-01-07,8137,2015-12-22,2016-02-21,15,140.55,8.57,,0.06
2,388,ATLANTIC IMPORTING COMPANY,2016-01-09,8169,2015-12-24,2016-02-16,5,106.6,4.61,,0.04
3,480,BACARDI USA INC,2016-01-12,8106,2015-12-20,2016-02-05,10100,137483.78,2935.2,,0.02
4,516,BANFI PRODUCTS CORP,2016-01-07,8170,2015-12-24,2016-02-12,1935,15527.25,429.2,,0.03


In [44]:
InvoicePurchases['freight_per_$'] = InvoicePurchases['Freight']/ InvoicePurchases['Dollars']
InvoicePurchases.head()

Unnamed: 0,VendorNumber,VendorName,InvoiceDate,PONumber,PODate,PayDate,Quantity,Dollars,Freight,Approval,freight_per_$
0,105,ALTAMAR BRANDS LLC,2016-01-04,8124,2015-12-21,2016-02-16,6,214.26,3.47,,0.02
1,4466,AMERICAN VINTAGE BEVERAGE,2016-01-07,8137,2015-12-22,2016-02-21,15,140.55,8.57,,0.06
2,388,ATLANTIC IMPORTING COMPANY,2016-01-09,8169,2015-12-24,2016-02-16,5,106.6,4.61,,0.04
3,480,BACARDI USA INC,2016-01-12,8106,2015-12-20,2016-02-05,10100,137483.78,2935.2,,0.02
4,516,BANFI PRODUCTS CORP,2016-01-07,8170,2015-12-24,2016-02-12,1935,15527.25,429.2,,0.03


In [48]:
Vendor_250000 = InvoicePurchases[['VendorNumber', 'VendorName', 'Freight', 'Dollars', 'Quantity', 'freight_per_$']].groupby(['VendorNumber', 'VendorName']).sum().sort_values(by='freight_per_$', ascending=False)
Vendor_250000.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Freight,Dollars,Quantity,freight_per_$
VendorNumber,VendorName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4848,LAIRD & CO,1043.69,203011.58,21422,0.39
90058,ZORVINO VINEYARDS,448.68,86122.71,10550,0.38
1273,CALEDONIA SPIRITS INC,1319.77,259604.7,8202,0.37
4550,KLIN SPIRITS LLC,531.5,103484.29,6840,0.36
7255,PSP WINES,648.71,128895.69,16493,0.36


In [55]:
highest = InvoicePurchases.groupby('VendorNumber')['freight_per_$'].mean().nlargest(10)
highest.head()

VendorNumber
4848    0.01
90058   0.01
1273    0.01
4550    0.01
7255    0.01
Name: freight_per_$, dtype: float64

In [56]:
lowest = InvoicePurchases.groupby('VendorNumber')['freight_per_$'].mean().nsmallest(5)
lowest.head()

VendorNumber
54      0.00
90026   0.00
90033   0.00
5083    0.00
60      0.00
Name: freight_per_$, dtype: float64

# Deliverable 4D

In [57]:
Transactions = InvoicePurchases[(InvoicePurchases['Freight'] > 100) & (InvoicePurchases['Quantity'] <= 1000)]
Transactions.head()


Unnamed: 0,VendorNumber,VendorName,InvoiceDate,PONumber,PODate,PayDate,Quantity,Dollars,Freight,Approval,freight_per_$
11,1485,CASTLE BRANDS CORP.,2016-01-08,8152,2015-12-23,2016-02-19,320,5420.41,179.26,,0.03
16,2242,DELICATO VINEYARDS INC,2016-01-06,8139,2015-12-22,2016-02-10,808,6646.46,127.05,,0.02
19,2555,DISARONNO INTERNATIONAL LLC,2016-01-11,8192,2015-12-25,2016-02-17,385,3506.41,146.86,,0.04
23,2561,EDRINGTON AMERICAS,2016-01-08,8175,2015-12-24,2016-02-10,136,5645.24,218.18,,0.04
28,3924,HEAVEN HILL DISTILLERIES,2016-01-08,8155,2015-12-23,2016-02-15,818,7079.02,200.02,,0.03


# Deliverable 4E

In [59]:
Transactions['Freight_per_$'] = Transactions['Freight'] / Transactions['Dollars']
Transactions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Transactions['Freight_per_$'] = Transactions['Freight'] / Transactions['Dollars']


Unnamed: 0,VendorNumber,VendorName,InvoiceDate,PONumber,PODate,PayDate,Quantity,Dollars,Freight,Approval,freight_per_$,Freight_per_$
11,1485,CASTLE BRANDS CORP.,2016-01-08,8152,2015-12-23,2016-02-19,320,5420.41,179.26,,0.03,0.03
16,2242,DELICATO VINEYARDS INC,2016-01-06,8139,2015-12-22,2016-02-10,808,6646.46,127.05,,0.02,0.02
19,2555,DISARONNO INTERNATIONAL LLC,2016-01-11,8192,2015-12-25,2016-02-17,385,3506.41,146.86,,0.04,0.04
23,2561,EDRINGTON AMERICAS,2016-01-08,8175,2015-12-24,2016-02-10,136,5645.24,218.18,,0.04,0.04
28,3924,HEAVEN HILL DISTILLERIES,2016-01-08,8155,2015-12-23,2016-02-15,818,7079.02,200.02,,0.03,0.03


In [60]:
Transactions['Freight_per_Unit'] = Transactions['Freight'] / Transactions['Quantity']
Transactions.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Transactions['Freight_per_Unit'] = Transactions['Freight'] / Transactions['Quantity']


Unnamed: 0,VendorNumber,VendorName,InvoiceDate,PONumber,PODate,PayDate,Quantity,Dollars,Freight,Approval,freight_per_$,Freight_per_$,Freight_per_Unit
11,1485,CASTLE BRANDS CORP.,2016-01-08,8152,2015-12-23,2016-02-19,320,5420.41,179.26,,0.03,0.03,0.56
16,2242,DELICATO VINEYARDS INC,2016-01-06,8139,2015-12-22,2016-02-10,808,6646.46,127.05,,0.02,0.02,0.16
19,2555,DISARONNO INTERNATIONAL LLC,2016-01-11,8192,2015-12-25,2016-02-17,385,3506.41,146.86,,0.04,0.04,0.38
23,2561,EDRINGTON AMERICAS,2016-01-08,8175,2015-12-24,2016-02-10,136,5645.24,218.18,,0.04,0.04,1.6
28,3924,HEAVEN HILL DISTILLERIES,2016-01-08,8155,2015-12-23,2016-02-15,818,7079.02,200.02,,0.03,0.03,0.24


In [65]:
Freight_Data = Transactions[['Freight_per_Unit', 'Freight_per_$', 'VendorName']]
Freight_Data.head()

Unnamed: 0,Freight_per_Unit,Freight_per_$,VendorName
11,0.56,0.03,CASTLE BRANDS CORP.
16,0.16,0.02,DELICATO VINEYARDS INC
19,0.38,0.04,DISARONNO INTERNATIONAL LLC
23,1.6,0.04,EDRINGTON AMERICAS
28,0.24,0.03,HEAVEN HILL DISTILLERIES


# Deliverable 5

In [66]:
Sales = pd.read_csv("SalesFINAL12-31-16Sample.csv")

In [67]:
Sales.head()

Unnamed: 0,InventoryId,Store,Brand,Description,Size,SalesQuantity,SalesDollars,SalesPrice,SalesDate,Volume,Classification,ExciseTax,VendorNo,VendorName
0,1_HARDERSFIELD_1009,1,1009,Rebel Yell Variety Pack,750mL 3 Pk,1,49.99,49.99,2016-01-02,750.0,1,0.79,8352,LUXCO INC
1,1_HARDERSFIELD_10238,1,10238,Layer Cake Primitivo Puglia,750mL,2,31.98,15.99,2016-01-02,750.0,2,0.22,4425,MARTIGNETTI COMPANIES
2,1_HARDERSFIELD_10239,1,10239,Cannonball Cab Svgn Cal,750mL,1,13.99,13.99,2016-01-02,750.0,2,0.11,4425,MARTIGNETTI COMPANIES
3,1_HARDERSFIELD_10266,1,10266,Klinker Brick Old Vine Znfdl,750mL,1,16.99,16.99,2016-01-09,750.0,2,0.11,9552,M S WALKER INC
4,1_HARDERSFIELD_1029,1,1029,Fulton's Harvest Apple Pie L,750mL,1,9.99,9.99,2016-01-03,750.0,1,0.79,3924,HEAVEN HILL DISTILLERIES


In [69]:
Analysis = Sales[['Description', 'SalesPrice', 'SalesDollars']].groupby('Description').sum().sort_values(by='SalesDollars', ascending=False)
Analysis.head()

Unnamed: 0_level_0,SalesPrice,SalesDollars
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Jack Daniels No 7 Black,122386.78,615149.4
Grey Goose Vodka,113719.76,565305.45
Tito's Handmade Vodka,99660.86,564900.16
Capt Morgan Spiced Rum,77164.32,481469.39
Absolut 80 Proof,82254.68,448035.22
