In [12]:
import pandas as pd
import numpy as np
import sqlite3
import os

print(os.getcwd())

/home/lh711/final


# Download & Clean Each Data Set

We have 3 different data sets that we are using for the project.
1. Coffee Quality Data
- From the Coffee Quality Data we extracted the Country Name, No. of Bags, Harvest Year and Bag Weight.
2. Meat Production Data
- From the Meat Dataset we focused on analyzing Country, Year, and Production in tonns
3. CO2 Emmisions
- From the CO2 Emmisions dataset we focused on the Country, Year, and the CO2 Emissions in kiloton (kt)
4. Milk Production Dataset
- From the Milk Production Dataset 

## Coffee Quality Dataset

In [13]:
project_dir = os.getcwd()

# Construct the full path to the CSV file
file_path = os.path.join(project_dir, 'coffee.csv')

# Read the CSV file
df = pd.read_csv(file_path)

#CLEANING UP THE DATA (getting rid of unneccessary info ie. columns)
dimensions = df.shape
print("Dimensions before Cleaning:", dimensions)
print(df.head(6))

df = df[["Country of Origin", "Number of Bags", "Bag Weight", "Harvest Year"]]#Taking only the columns we need

dimensions = df.shape
print("\nDimensions after Cleaning:",dimensions)
print(df.head(6))

#May want to drop all the columns except the country name, no. of bags, harvest year and bag weight.

Dimensions before Cleaning: (207, 41)
   Unnamed: 0  ID Country of Origin                 Farm Name  \
0           0   0          Colombia          Finca El Paraiso   
1           1   1            Taiwan  Royal Bean Geisha Estate   
2           2   2              Laos        OKLAO coffee farms   
3           3   3        Costa Rica                 La Cumbre   
4           4   4          Colombia           Finca Santuario   
5           5   5         Guatemala                 La Colina   

                                Lot Number                           Mill  \
0                               CQU2022015               Finca El Paraiso   
1  The 2022 Pacific Rim Coffee Summit,T037       Royal Bean Geisha Estate   
2  The 2022 Pacific Rim Coffee Summit,LA01  oklao coffee processing plant   
3                               CQU2022017        La Montana Tarrazu MIll   
4                               CQU2023002                Finca Santuario   
5  The 2022 Pacific Rim Coffee Summit,GT02  

### Null/Missing Values

In [14]:
#Finding Missing Values
#Column 1
missing_countries = df["Country of Origin"].isnull().sum()
print("Amount of Missing Values for Column 1:", missing_countries)

#Column 2
missing_bags = df["Number of Bags"].isnull().sum()
print("Amount of Missing Values for Column 2:", missing_bags)

#Column 3
missing_weight = df["Bag Weight"].isnull().sum()
print("Amount of Missing Values for Column 3:", missing_weight)

#Column 4
missing_year = df["Harvest Year"].isnull().sum()
print("Amount of Missing Values for Column 4:", missing_year)

#Finding Null Values

Amount of Missing Values for Column 1: 0
Amount of Missing Values for Column 2: 0
Amount of Missing Values for Column 3: 0
Amount of Missing Values for Column 4: 0


## Meat Dataset

In [43]:
project_dir = os.getcwd()

# Construct the full path to the CSV file
file_path = os.path.join(project_dir, 'meat.csv')
df = pd.read_csv(file_path)
#df = pd.read_csv('/home/ssg145/final/meat.csv')
initial_size = df.shape[0]

dimensions = df.shape
print("Dimensions before Cleaning:", dimensions)
print(df.head(6))

#CLEANING UP DATA
df = df.drop(columns=['Code'])
#print(df.head(6))

df = df.rename(columns={'Meat, total | 00001765 || Production | 005510 || tonnes': 'Production amt'})

dimensions = df.shape
print("\nDimensions after Cleaning:", dimensions)
print(df.head(6))


Dimensions before Cleaning: (14382, 4)
               Country Code  Year  \
0              Bahamas  BHS  1961   
1               Brunei  BRN  1961   
2                Qatar  QAT  1961   
3        Faroe Islands  FRO  1961   
4               Tuvalu  TUV  1961   
5  Antigua and Barbuda  ATG  1961   

   Meat, total | 00001765 || Production | 005510 || tonnes  
0                                             1260.7        
1                                             1289.8        
2                                             1769.2        
3                                                0.0        
4                                               30.0        
5                                              395.0        

Dimensions after Cleaning: (14382, 3)
               Country  Year  Production amt
0              Bahamas  1961          1260.7
1               Brunei  1961          1289.8
2                Qatar  1961          1769.2
3        Faroe Islands  1961             0.0
4         

### Null/Missing Values

In [44]:
#Finding Missing Values
#Column 1
missing_countries = df["Country"].isnull().sum()
print("Amount of Missing Values for Column 1:", missing_countries)

#Column 2
missing_year = df["Year"].isnull().sum()
print("Amount of Missing Values for Column 2:", missing_year)

#Column 3
missing_production = df["Production amt"].isnull().sum()
print("Amount of Missing Values for Column 3:", missing_production)

#Finding Null Values: there are country values that are continents or low income/high income countries
print(df.shape)
bad_values = ["Asia", "North America", "Africa", "South America", "Europe", "Antartica", "World", "Lower-middle-income countries", "Upper-middle-income countries", "High-income countries", "Low-income countries"]
df = df[~df["Country"].str.contains("|".join(bad_values), case = False, na = False)]
print(df.shape)

Amount of Missing Values for Column 1: 0
Amount of Missing Values for Column 2: 0
Amount of Missing Values for Column 3: 0
(14382, 3)
(12429, 3)


## CO2 Emmissions Dataset

In [6]:
#df = pd.read_csv('./final/co2_country.csv')
project_dir = os.getcwd()

# Construct the full path to the CSV file
file_path = os.path.join(project_dir, 'co2_country.csv')
df = pd.read_csv(file_path)

dimensions = df.shape
print("Dimensions before Cleaning:", dimensions)
print(df.head(6))

#Clean the data (get rid of the country codes)
df = df.iloc[:, 1:]

dimensions = df.shape
print("\nDimensions before Cleaning:", dimensions)
print(df.head(6))

Dimensions before Cleaning: (13953, 4)
  country_code country_name  year      value
0          ABW        Aruba  1960  11092.675
1          ABW        Aruba  1961  11576.719
2          ABW        Aruba  1962  12713.489
3          ABW        Aruba  1963  12178.107
4          ABW        Aruba  1964  11840.743
5          ABW        Aruba  1965  10623.299

Dimensions before Cleaning: (13953, 3)
  country_name  year      value
0        Aruba  1960  11092.675
1        Aruba  1961  11576.719
2        Aruba  1962  12713.489
3        Aruba  1963  12178.107
4        Aruba  1964  11840.743
5        Aruba  1965  10623.299


### Null/Missing Values

In [7]:
#Finding Missing Values
#Column 1
missing_countries = df["country_name"].isnull().sum()
print("Amount of Missing Values for Column 1:", missing_countries)

#Column 2
missing_year = df["year"].isnull().sum()
print("Amount of Missing Values for Column 2:", missing_year)

#Column 3
missing_value = df["value"].isnull().sum()
print("Amount of Missing Values for Column 3:", missing_value)


Amount of Missing Values for Column 1: 0
Amount of Missing Values for Column 2: 0
Amount of Missing Values for Column 3: 0


## Milk Production Dataset

In [8]:
project_dir = os.getcwd()

# Construct the full path to the CSV file
file_path = os.path.join(project_dir, 'milk.csv')
df = pd.read_csv(file_path)
#df = pd.read_csv('/home/ssg145/final/meat.csv')
initial_size = df.shape[0]

#print(df.head(6))

#CLEANING UP DATA
dimensions = df.shape
print("Dimensions before Cleaning:", dimensions)
print(df.head(6))
df = df.drop(columns=['Code'])

df = df.rename(columns={'Milk | 00001780 || Production | 005510 || tonnes': 'Milk amt'})
df = df.rename(columns={'Entity': 'Country'})

dimensions = df.shape
print("\nDimensions after Cleaning:", dimensions)
print(df.head(6))


Dimensions before Cleaning: (13615, 4)
        Entity Code  Year  Milk | 00001780 || Production | 005510 || tonnes
0  Afghanistan  AFG  1961                                          574900.0
1  Afghanistan  AFG  1962                                          576700.0
2  Afghanistan  AFG  1963                                          646700.0
3  Afghanistan  AFG  1964                                          659900.0
4  Afghanistan  AFG  1965                                          720900.0
5  Afghanistan  AFG  1966                                          750500.0

Dimensions after Cleaning: (13615, 3)
       Country  Year  Milk amt
0  Afghanistan  1961  574900.0
1  Afghanistan  1962  576700.0
2  Afghanistan  1963  646700.0
3  Afghanistan  1964  659900.0
4  Afghanistan  1965  720900.0
5  Afghanistan  1966  750500.0


### Null/Missing Values

In [9]:
#Finding Missing Values
#Column 1
missing_countries = df["Country"].isnull().sum()
print("Amount of Missing Values for Column 1:", missing_countries)

#Column 2
missing_year = df["Year"].isnull().sum()
print("Amount of Missing Values for Column 2:", missing_year)

#Column 3
missing_value = df["Milk amt"].isnull().sum()
print("Amount of Missing Values for Column 3:", missing_value)


Amount of Missing Values for Column 1: 0
Amount of Missing Values for Column 2: 0
Amount of Missing Values for Column 3: 0


In [10]:
#Steps 
#Check for null/missing values 
#average the country values 
#join the datasets 
#Perform analysis (linear regression) 
#Create a prediction model 


### Average Country Values

### Join Datasets Using SQL

### Perform Analysis

### Prediction Model