<h3 style="color:purple">Train initial regression model with SCISAT data</h3>

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn import linear_model

In [4]:
# Read the SCISAT CO2 table from the CSV file in the working directory,
# then show the resulting frame as the cell output.
df = pd.read_csv(filepath_or_buffer='co2.csv')
df

Unnamed: 0,Alt_Mean,std CO2,date,lat,long
0,0.000266,0.000156,9/11/2020,-80.79,-137.12
1,0.000274,0.000160,9/11/2020,58.52,80.06
2,0.000282,0.000151,9/11/2020,-80.77,-161.25
3,0.000271,0.000162,9/11/2020,58.67,55.66
4,0.000282,0.000143,9/11/2020,-80.74,174.63
...,...,...,...,...,...
98507,0.000250,0.000148,2/1/2004,-21.13,74.99
98508,0.000240,0.000149,2/1/2004,-20.83,50.43
98509,0.000297,0.000114,2/1/2004,-19.94,-23.24
98510,0.000251,0.000145,2/1/2004,-19.64,-47.80


In [5]:
def date_to_float(date_str):
    """Convert an 'MM/DD/YYYY' date string to a Unix timestamp in seconds.

    The date is interpreted as midnight UTC, so the result is the same on
    every machine regardless of its local time zone (a naive
    ``datetime.timestamp()`` call would use the local zone instead).

    Parameters
    ----------
    date_str : str
        Date written as month/day/four-digit year, e.g. '9/11/2020'.

    Returns
    -------
    float or None
        Seconds elapsed since 1970-01-01 00:00:00 UTC, or None when the
        value is not a string (e.g. NaN from an empty CSV cell) or does not
        match the expected format.
    """
    try:
        # Parse the date string into a naive datetime (midnight of that day).
        date_obj = datetime.strptime(date_str, '%m/%d/%Y')  # Adjust the format if needed
    except (TypeError, ValueError):
        # TypeError: input is not a string; ValueError: string has the wrong format.
        return None
    # Offset from the epoch computed with naive arithmetic, i.e. treated as UTC.
    return (date_obj - datetime(1970, 1, 1)).total_seconds()

# Replace every date string in the 'date' column with its numeric timestamp
# so the column can be used as a regression feature, then print the frame.
df['date'] = df['date'].map(date_to_float)
print(df)

       Alt_Mean   std CO2          date    lat    long
0      0.000266  0.000156  1.599782e+09 -80.79 -137.12
1      0.000274  0.000160  1.599782e+09  58.52   80.06
2      0.000282  0.000151  1.599782e+09 -80.77 -161.25
3      0.000271  0.000162  1.599782e+09  58.67   55.66
4      0.000282  0.000143  1.599782e+09 -80.74  174.63
...         ...       ...           ...    ...     ...
98507  0.000250  0.000148  1.075594e+09 -21.13   74.99
98508  0.000240  0.000149  1.075594e+09 -20.83   50.43
98509  0.000297  0.000114  1.075594e+09 -19.94  -23.24
98510  0.000251  0.000145  1.075594e+09 -19.64  -47.80
98511  0.000254  0.000147  1.075594e+09 -19.33  -72.35

[98512 rows x 5 columns]


In [6]:
# Names of the columns that contain at least one missing value.
columns_with_nan = [name for name, flagged in df.isna().any().items() if flagged]

if not columns_with_nan:
    print("No columns have NaN values.")
else:
    print("Columns with NaN values:")
    # One column name per line.
    print("\n".join(map(str, columns_with_nan)))

Columns with NaN values:
Alt_Mean
std CO2


**Data preprocessing: fill NaN values with each column's median**

In [7]:
# Median imputation: each listed column has its missing entries replaced by
# that column's own median (computed before filling).
for column in ('date', 'Alt_Mean', 'std CO2'):
    df[column] = df[column].fillna(df[column].median())

In [8]:
# Ordinary least-squares model: every column except 'std CO2' is a feature,
# and 'std CO2' is the value being predicted.
reg = linear_model.LinearRegression()
reg.fit(X=df.drop(columns=['std CO2']), y=df['std CO2'])

In [9]:
# Fitted coefficients, one per feature, in the column order of the frame
# passed to fit (all columns of df except 'std CO2').
reg.coef_

array([-5.48224178e-01,  5.04258446e-14,  1.60705712e-10, -3.28683758e-09])

In [10]:
# Fitted intercept (constant term) of the linear model.
reg.intercept_

0.000219313115659189

**Test initial model: prediction for a future date (9/16/2045)**

In [11]:
# Predict 'std CO2' for one hypothetical observation: median Alt_Mean,
# 16 Sep 2045, at latitude -80.79 / longitude -137.12.
# The input is a one-row DataFrame whose column names match the training
# features, so the feature order is explicit and scikit-learn does not warn
# that X lacks valid feature names.
reg.predict(pd.DataFrame(
    [[df.Alt_Mean.median(), date_to_float('9/16/2045'), -80.79, -137.12]],
    columns=['Alt_Mean', 'date', 'lat', 'long'],
))



array([0.00019149])

**Test initial model: prediction for 9/20/2020 at the same location**

In [12]:
# Predict 'std CO2' for one hypothetical observation: median Alt_Mean,
# 20 Sep 2020, at latitude -80.79 / longitude -137.12.
# The input is a one-row DataFrame whose column names match the training
# features, so the feature order is explicit and scikit-learn does not warn
# that X lacks valid feature names.
reg.predict(pd.DataFrame(
    [[df.Alt_Mean.median(), date_to_float('9/20/2020'), -80.79, -137.12]],
    columns=['Alt_Mean', 'date', 'lat', 'long'],
))



array([0.00015172])