In [None]:
import pandas as pd
import numpy as np
import sqlite3
import os

# Show the working directory: the CSV paths in the cells below are built relative to it.
print(os.getcwd())

# Download & Clean Each Data Set

We have 4 different data sets that we are using for the project.
1. Coffee Quality Data
- From the Coffee Quality Data we extracted the Country Name, No. of Bags, Harvest Year and Bag Weight.
2. Meat Production Data
- From the Meat Dataset we focused on analyzing Country, Year, and Production in tonnes
3. CO2 Emissions
- From the CO2 Emissions dataset we focused on the Country, Year, and the CO2 Emissions in kilotons (kt)
4. Milk Production Dataset
- From the Milk Production Dataset we focused on the Country, Year, and Milk Production in tonnes

## Coffee Quality Dataset

In [None]:
# Load the coffee quality CSV that sits in the current working directory.
project_dir = os.getcwd()
file_path = os.path.join(project_dir, 'coffee.csv')
coffee_df = pd.read_csv(file_path)

# Snapshot of the raw table before any column is removed.
dimensions = coffee_df.shape
print("Dimensions before Cleaning:", dimensions)
print(coffee_df.head(6))

# Keep only the four columns the analysis needs; every other column is dropped.
coffee_columns = ["Country of Origin", "Number of Bags", "Bag Weight", "Harvest Year"]
coffee_df = coffee_df.loc[:, coffee_columns]

# Same snapshot after trimming, for comparison with the output above.
dimensions = coffee_df.shape
print("\nDimensions after Cleaning:", dimensions)
print(coffee_df.head(6))

### Null/Missing Values

In [None]:
# Finding Missing Values
# Count the null entries of every analysed column in one pass, then report
# them one column at a time.
coffee_null_counts = coffee_df[
    ["Country of Origin", "Number of Bags", "Bag Weight", "Harvest Year"]
].isnull().sum()

# Column 1
missing_countries = coffee_null_counts["Country of Origin"]
print("Amount of Missing Values for Column 1:", missing_countries)

# Column 2
missing_bags = coffee_null_counts["Number of Bags"]
print("Amount of Missing Values for Column 2:", missing_bags)

# Column 3
missing_weight = coffee_null_counts["Bag Weight"]
print("Amount of Missing Values for Column 3:", missing_weight)

# Column 4
missing_year = coffee_null_counts["Harvest Year"]
print("Amount of Missing Values for Column 4:", missing_year)

# Finding Null Values

## Meat Dataset

In [None]:
# Load the meat production CSV that sits in the current working directory.
project_dir = os.getcwd()
file_path = os.path.join(project_dir, 'meat.csv')
meat_df = pd.read_csv(file_path)
initial_size = meat_df.shape[0]  # number of rows in the raw file

dimensions = meat_df.shape
print("Dimensions before Cleaning:", dimensions)
print(meat_df.head(6))

# CLEANING UP DATA
# The 'Code' column is not used by the analysis.
meat_df = meat_df.drop(columns=['Code'])

# Give the columns short names. 'Entity' -> 'Country' matches what the milk
# dataset cell does, and the cells that follow read meat_df["Country"].
# rename() ignores mapping keys that are not present, so this stays a no-op
# if the file already ships a 'Country' column.
meat_df = meat_df.rename(columns={
    'Entity': 'Country',
    'Meat, total | 00001765 || Production | 005510 || tonnes': 'Production amt',
})

dimensions = meat_df.shape
print("\nDimensions after Cleaning:", dimensions)
print(meat_df.head(6))


### Null/Missing Values

In [None]:
# Finding Missing Values
# Column 1
missing_countries = meat_df["Country"].isnull().sum()
print("Amount of Missing Values for Column 1:", missing_countries)

# Column 2
missing_year = meat_df["Year"].isnull().sum()
print("Amount of Missing Values for Column 2:", missing_year)

# Column 3
missing_production = meat_df["Production amt"].isnull().sum()
print("Amount of Missing Values for Column 3:", missing_production)

# Finding Null Values: some "Country" entries are aggregates (continents, the
# world, income groups) rather than countries, so those rows are removed.
print(meat_df.shape)
bad_values = [
    "Asia", "North America", "Africa", "South America", "Europe",
    "Antarctica", "Antartica",  # correct spelling added; the old misspelling is kept in case the data uses it
    "World",
    "Lower-middle-income countries", "Upper-middle-income countries",
    "High-income countries", "Low-income countries",
]

# Compare whole names instead of substrings: a substring search for "Africa"
# also deletes real countries such as "South Africa" and "Central African
# Republic". The comparison stays case-insensitive, and a trailing
# parenthetical tag (e.g. "Africa (FAO)", if the export uses such tags) is
# ignored so tagged aggregates are still caught.
# NOTE(review): sub-regional aggregates (e.g. "Eastern Africa") are no longer
# removed as a side effect; add them to bad_values if the file contains them.
country_key = (
    meat_df["Country"]
    .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)
    .str.strip()
    .str.lower()
)
# Rows with a missing country never match and are therefore kept, as before.
is_aggregate = country_key.isin([value.lower() for value in bad_values])
meat_df = meat_df[~is_aggregate]
print(meat_df.shape)

## CO2 Emissions Dataset

In [None]:
# Load the CO2 emissions CSV that sits in the current working directory.
project_dir = os.getcwd()
file_path = os.path.join(project_dir, 'co2_country.csv')
df_co2 = pd.read_csv(file_path)

dimensions = df_co2.shape
print("Dimensions before Cleaning:", dimensions)
print(df_co2.head(6))

# Clean the data (get rid of the country codes) by dropping the first column.
# NOTE(review): this is positional and assumes the country code is the first
# column of the CSV; confirm against the file header.
df_co2 = df_co2.iloc[:, 1:]

# This snapshot is taken after the drop, so it is labelled "after" (the label
# previously repeated "before").
dimensions = df_co2.shape
print("\nDimensions after Cleaning:", dimensions)
print(df_co2.head(6))

### Null/Missing Values

In [None]:
# Finding Missing Values
# Count the null entries of the three analysed columns in one pass, then
# report them one column at a time.
co2_null_counts = df_co2[["country_name", "year", "value"]].isnull().sum()

# Column 1
missing_countries = co2_null_counts["country_name"]
print("Amount of Missing Values for Column 1:", missing_countries)

# Column 2
missing_year = co2_null_counts["year"]
print("Amount of Missing Values for Column 2:", missing_year)

# Column 3
missing_value = co2_null_counts["value"]
print("Amount of Missing Values for Column 3:", missing_value)


## Milk Production Dataset

In [None]:
# Load the milk production CSV that sits in the current working directory.
project_dir = os.getcwd()
file_path = os.path.join(project_dir, 'milk.csv')
milk_df = pd.read_csv(file_path)

# Snapshot of the raw table; the row count is also kept on its own.
dimensions = milk_df.shape
initial_size = dimensions[0]
print("Dimensions before Cleaning:", dimensions)
print(milk_df.head(6))

# CLEANING UP DATA
# Drop the unused 'Code' column and give the remaining columns short names.
milk_column_names = {
    'Milk | 00001780 || Production | 005510 || tonnes': 'Milk amt',
    'Entity': 'Country',
}
milk_df = milk_df.drop(columns=['Code']).rename(columns=milk_column_names)

dimensions = milk_df.shape
print("\nDimensions after Cleaning:", dimensions)
print(milk_df.head(6))


### Null/Missing Values

In [None]:
# Finding Missing Values
# Count the null entries of the three analysed columns in one pass, then
# report them one column at a time.
milk_null_counts = milk_df[["Country", "Year", "Milk amt"]].isnull().sum()

# Column 1
missing_countries = milk_null_counts["Country"]
print("Amount of Missing Values for Column 1:", missing_countries)

# Column 2
missing_year = milk_null_counts["Year"]
print("Amount of Missing Values for Column 2:", missing_year)

# Column 3
missing_value = milk_null_counts["Milk amt"]
print("Amount of Missing Values for Column 3:", missing_value)


In [None]:
# Steps
# 1. Check for null/missing values
# 2. Average the country values
# 3. Join the datasets
# 4. Perform analysis (linear regression)
# 5. Create a prediction model


### Average Country Values

### Join Datasets Using SQL

### Perform Analysis

### Prediction Model