In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, model_selection, tree, pipeline, preprocessing, metrics, cluster
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [2]:
import altair as alt

### Data Cleaning and Preprocessing

In [3]:
# Load the per-country monthly temperature table from the working directory.
country_csv_path = 'GlobalLandTemperaturesByCountry.csv'
df_country = pd.read_csv(country_csv_path)

# Plain-text preview of the first ten rows; the column dtypes are the cell's output.
first_ten_rows = df_country.head(10)
print(first_ten_rows)
df_country.dtypes

FileNotFoundError: [Errno 2] No such file or directory: 'GlobalLandTemperaturesByCountry.csv'

In [None]:
# Names of the columns that hold at least one missing value.
has_missing = df_country.isna().any()
columns_with_na = has_missing[has_missing].index.tolist()
columns_with_na

In [None]:
# Discard, in place, every row that lacks the temperature reading or its uncertainty.
required_country_columns = ['AverageTemperature', 'AverageTemperatureUncertainty']
df_country.dropna(axis=0, how='any', subset=required_country_columns, inplace=True)
df_country

In [None]:
# Parse the raw 'dt' strings into timestamps. No explicit `format` is given:
# a '%Y/%m/%d' pattern only matches slash-separated dates, so with the strict
# format matching of pandas >= 2.0 any dash-separated ISO date ('1743-11-01')
# would be coerced to NaT and every later Year/Month filter would come back
# empty. Letting pandas infer the layout accepts either separator.
# NOTE(review): confirm which separator the CSV actually uses.
df_country['dt'] = pd.to_datetime(df_country['dt'], errors='coerce')

# Calendar parts used by the year / summer-month filters further down.
df_country['Year'] = df_country['dt'].dt.year
df_country['Month'] = df_country['dt'].dt.month

# The rename removes 'dt', so this cell cannot be re-run without reloading the frame.
df_country.rename(columns={'dt': 'Date'}, inplace=True)
df_country

In [None]:
# Load the per-major-city monthly temperature table from the working directory.
major_city_csv_path = 'GlobalLandTemperaturesByMajorCity.csv'
df_cities = pd.read_csv(major_city_csv_path)
df_cities

In [None]:
# Discard, in place, every city row that lacks the temperature reading or its uncertainty.
required_city_columns = ['AverageTemperature', 'AverageTemperatureUncertainty']
df_cities.dropna(axis=0, how='any', subset=required_city_columns, inplace=True)
df_cities

In [None]:
# Parse the raw 'dt' strings into timestamps. No explicit `format` is given:
# a '%Y/%m/%d' pattern only matches slash-separated dates, so with the strict
# format matching of pandas >= 2.0 any dash-separated ISO date would be
# coerced to NaT and the derived Year/Month columns would be empty.
# Letting pandas infer the layout accepts either separator.
# NOTE(review): confirm which separator the CSV actually uses.
df_cities['dt'] = pd.to_datetime(df_cities['dt'], errors='coerce')

# Calendar parts used by later filters.
df_cities['Year'] = df_cities['dt'].dt.year
df_cities['Month'] = df_cities['dt'].dt.month

# The rename removes 'dt', so this cell cannot be re-run without reloading the frame.
df_cities.rename(columns={'dt': 'Date'}, inplace=True)
df_cities

In [None]:
# Load the global monthly temperature table from the working directory.
global_csv_path = 'GlobalTemperatures.csv'
df_global = pd.read_csv(global_csv_path)
df_global

In [None]:
# A row is kept only when every measurement and its matching uncertainty are present.
global_measurements = [
    'LandAverageTemperature',
    'LandMaxTemperature',
    'LandMinTemperature',
    'LandAndOceanAverageTemperature',
]
required_global_columns = [
    column
    for measurement in global_measurements
    for column in (measurement, measurement + 'Uncertainty')
]
df_global.dropna(axis=0, how='any', subset=required_global_columns, inplace=True)

In [None]:
# Parse the raw 'dt' strings into timestamps. No explicit `format` is given:
# a '%Y/%m/%d' pattern only matches slash-separated dates, so with the strict
# format matching of pandas >= 2.0 any dash-separated ISO date would be
# coerced to NaT and the summer filter built on Year/Month would be empty.
# Letting pandas infer the layout accepts either separator.
# NOTE(review): confirm which separator the CSV actually uses.
df_global['dt'] = pd.to_datetime(df_global['dt'], errors='coerce')

# Calendar parts used by the summer filter and the regression feature.
df_global['Year'] = df_global['dt'].dt.year
df_global['Month'] = df_global['dt'].dt.month

# The rename removes 'dt', so this cell cannot be re-run without reloading the frame.
df_global.rename(columns={'dt': 'Date'}, inplace=True)

In [None]:
# Display the global table after row filtering and date parsing.
df_global

In [None]:
# Descriptive summary statistics of the global table.
df_global.describe()

In [None]:
# Summer (June-August) rows of the global table from the year 1800 onwards.
summer_months = [6, 7, 8]
from_1800 = df_global['Year'] >= 1800
in_summer = df_global['Month'].isin(summer_months)
df_summers = df_global[from_1800 & in_summer]
df_summers

In [None]:
# Feature matrix: the year as a single-column 2D array; target: land average temperature.
X = df_summers[['Year']].to_numpy().reshape(-1, 1)
y = df_summers['LandAverageTemperature'].to_numpy()

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares line on the training rows.
model = LinearRegression().fit(X_train, y_train)

# Predicted temperatures for the held-out years.
y_pred = model.predict(X_test)

# Held-out observations against the fitted line.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_test, y_test, color='seagreen', label='Actual Temperatures')
ax.plot(X_test, y_pred, color='mediumblue', linewidth=3, label='Linear Regression')
ax.set_xlabel('Year')
ax.set_ylabel('Land Average Temperature in Summer')
ax.set_xticks(range(1850, 2015, 20))
ax.set_title('Linear Regression for Summer Global Surface Temperatures')
ax.legend()
plt.show()

# Goodness of fit on the held-out rows.
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'R-squared: {r_squared}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

In [None]:
# Axis scales shared by the scatter layer and the fitted line.
year_scale = alt.Scale(domain=[1850, 2010])
temperature_scale = alt.Scale(domain=[13, 15])

# Scatter layer: one point per summer month of the global series.
chart = (
    alt.Chart(df_summers)
    .mark_circle(color='seagreen')
    .encode(
        x=alt.X('Year:Q', title='Year', scale=year_scale, axis=alt.Axis(format='d')),
        y=alt.Y('LandAverageTemperature:Q', title='Land Average Temperature in Summer',
                scale=temperature_scale),
        tooltip=['Year', 'LandAverageTemperature'],
    )
)

# Line layer: regression of temperature on year, computed by the chart transform.
reg_line = (
    alt.Chart(df_summers)
    .mark_line(color='mediumblue')
    .transform_regression('Year', 'LandAverageTemperature')
    .encode(
        x=alt.X('Year:Q', scale=year_scale),
        y=alt.Y('LandAverageTemperature:Q', scale=temperature_scale),
    )
)

# Overlay the two layers.
chart1 = alt.layer(chart, reg_line)

# Size, title and pan/zoom behaviour apply to the displayed copy only.
chart1.properties(
    title='Linear Regression for Summer Global Surface Temperatures',
    width=600,
    height=400,
).interactive()

In [None]:
# Country-level United States rows for June-August from 1850 onwards.
is_united_states = df_country['Country'] == 'United States'
from_1850 = df_country['Year'] >= 1850
in_summer = df_country['Month'].isin([6, 7, 8])
df_usa3 = df_country[is_united_states & from_1850 & in_summer]
df_usa3

In [None]:
# Second, independent copy of the major-city table, cleaned the same way as df_cities:
# rows without a temperature or uncertainty are dropped, 'dt' is parsed to a timestamp,
# Year/Month are derived from it, and 'dt' is finally renamed to 'Date'.
#
# No explicit `format` is passed to to_datetime: a '%Y/%m/%d' pattern only matches
# slash-separated dates, so with the strict format matching of pandas >= 2.0 any
# dash-separated ISO date would be coerced to NaT and every filter on Year/Month
# would come back empty. Letting pandas infer the layout accepts either separator.
# NOTE(review): confirm which separator the CSV actually uses.
#
# Built as one non-mutating chain, so the cell can be re-run safely.
df_cities2 = (
    pd.read_csv('GlobalLandTemperaturesByMajorCity.csv')
    .dropna(subset=['AverageTemperature', 'AverageTemperatureUncertainty'])
    .assign(dt=lambda frame: pd.to_datetime(frame['dt'], errors='coerce'))
    .assign(
        Year=lambda frame: frame['dt'].dt.year,
        Month=lambda frame: frame['dt'].dt.month,
    )
    .rename(columns={'dt': 'Date'})
)

In [None]:
# US major-city rows for June-August from 1850 onwards.
# .copy() makes this an independent frame: a later cell adds a 'Cluster' column
# to df_USA2, and writing to a filtered slice triggers SettingWithCopyWarning.
df_USA2 = df_cities2[
    (df_cities2['Country'] == 'United States')
    & (df_cities2['Year'] >= 1850)
    & df_cities2['Month'].isin([6, 7, 8])
].copy()
df_USA2

In [None]:
# New York rows for June-August from 1913 onwards.
is_new_york = df_cities2['City'] == 'New York'
from_1913 = df_cities2['Year'] >= 1913
in_summer = df_cities2['Month'].isin([6, 7, 8])
df_ny = df_cities2[is_new_york & from_1913 & in_summer]
df_ny

In [None]:
# Linear trend of summer temperature over the years for the country-level
# United States series (df_usa3 is filtered from df_country).
X = df_usa3[['Year']]
X = X.values.reshape(-1, 1)  # single-column 2D feature array

y = df_usa3['AverageTemperature']
y = y.values  # 1D target array

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the regression on the training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Predicted temperatures for the held-out years.
y_pred = model.predict(X_test)

# Held-out observations against the fitted line.
plt.figure(figsize=(8, 6))
plt.scatter(X_test, y_test, color='seagreen', label='Actual Temperatures')
plt.plot(X_test, y_pred, color='mediumblue', linewidth=3, label='Linear Regression')
plt.xlabel('Year')
plt.ylabel('Land Average Temperature in Summer')
plt.xticks(range(1850, 2015, 20))
# The data is the country-wide series, not the major-city table, so the title says so.
plt.title('Linear Regression for Summer Temperatures in the United States')
plt.legend()
plt.show()

# Goodness of fit on the held-out rows.
r_squared = r2_score(y_test, y_pred)
print(f'R-squared: {r_squared}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

In [None]:
# Scatter layer: one point per summer month of the country-level US series.
chart = alt.Chart(df_usa3).mark_circle(color='indianred').encode(
    x=alt.X('Year:Q', title='Year', scale=alt.Scale(domain=[1850, 2010]), axis=alt.Axis(format='d')),
    y=alt.Y('AverageTemperature:Q', title='Land Average Temperature in Summer',
            scale=alt.Scale(domain=[17, 23])),
    tooltip=['Year', 'AverageTemperature']
)

# Line layer: regression of temperature on year, computed by the chart transform.
reg_line = alt.Chart(df_usa3).mark_line(color='mediumblue').transform_regression(
    'Year', 'AverageTemperature'
).encode(
    x=alt.X('Year:Q', scale=alt.Scale(domain=[1850, 2010])),
    y=alt.Y('AverageTemperature:Q', scale=alt.Scale(domain=[17, 23]))
)

# Overlay the two layers.
chart2 = chart + reg_line

# df_usa3 is filtered from the country table, so the title names the country
# rather than "Major US Cities".
chart2.properties(
    title='Linear Regression for Summer Temperatures in the United States',
    width=600,
    height=400
).interactive()

In [None]:
# Linear trend of summer temperature over the years for the US major-city rows.
X = df_USA2[['Year']]
X = X.values.reshape(-1, 1)  # single-column 2D feature array

y = df_USA2['AverageTemperature']
y = y.values  # 1D target array

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the regression on the training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Predicted temperatures for the held-out years.
y_pred = model.predict(X_test)

# Held-out observations against the fitted line.
plt.figure(figsize=(8, 6))
plt.scatter(X_test, y_test, color='seagreen', label='Actual Temperatures')
plt.plot(X_test, y_pred, color='mediumblue', linewidth=3, label='Linear Regression')
plt.xlabel('Year')
plt.ylabel('Land Average Temperature in Summer')
# df_USA2 is filtered to Year >= 1850, so the ticks start there as well;
# starting at 1910 left the earlier decades without labels.
plt.xticks(range(1850, 2015, 20))
plt.title('Linear Regression for Summer Temperatures in Major US Cities')
plt.legend()
plt.show()

# Goodness of fit on the held-out rows.
r_squared = r2_score(y_test, y_pred)
print(f'R-squared: {r_squared}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

In [None]:
# Linear trend of summer temperature over the years for New York only.
X = df_ny[['Year']]
X = X.values.reshape(-1, 1)  # single-column 2D feature array

y = df_ny['AverageTemperature']
y = y.values  # 1D target array

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the regression on the training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Predicted temperatures for the held-out years.
y_pred = model.predict(X_test)

# Held-out observations against the fitted line.
plt.figure(figsize=(12, 8))
plt.scatter(X_test, y_test, color='orchid', label='Actual Temperatures')
plt.plot(X_test, y_pred, color='blue', linewidth=3, label='Linear Regression')
plt.xlabel('Year')
plt.ylabel('Land Average Temperature in Summer')
plt.xticks(range(1900, 2015, 20))
# df_ny holds a single city, so the title names it rather than "Major US Cities".
plt.title('Linear Regression for Summer Temperatures in New York')
plt.legend()
plt.show()

# Goodness of fit on the held-out rows.
r_squared = r2_score(y_test, y_pred)
print(f'R-squared: {r_squared}')

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')

rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

In [None]:
# Later cells cluster and slice df_USA, so it has to be created here; without a
# definition a fresh "Restart & Run All" stops at this cell with a NameError.
# NOTE(review): assumed to be every United States row of the major-city table
# (the clustering plot built from it is titled 'US City Temperatures') -
# confirm the intended filter.
# .copy() gives an independent frame, because a 'Cluster' column is added to it later.
df_USA = df_cities[df_cities['Country'] == 'United States'].copy()
# Optional restriction to the 20th century onwards:
# df_USA = df_USA[df_USA['Year'] >= 1900]
df_USA

In [None]:
# Features used for clustering: temperature, its uncertainty, and the year.
temperature_data = df_USA[['AverageTemperature', 'AverageTemperatureUncertainty', 'Year']]

# Standardize each feature before clustering.
scaler = StandardScaler()
temperature_data_scaled = scaler.fit_transform(temperature_data)

# K-Means with a fixed seed so the labels are repeatable between runs.
num_clusters = 3  # Adjust the number of clusters based on your analysis
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
clusters = kmeans.fit_predict(temperature_data_scaled)

# Attach the labels with assign(), which builds a new frame instead of writing
# into a possibly filtered slice (avoids SettingWithCopyWarning) and keeps the
# cell safe to re-run.
df_USA = df_USA.assign(Cluster=clusters)

# Year against temperature, coloured by cluster label.
plt.figure(figsize=(8, 6))  # 8x6 inch figure
plt.scatter(df_USA['Year'], df_USA['AverageTemperature'], c=df_USA['Cluster'], cmap='viridis', alpha=0.7)  # alpha < 1 keeps overlapping points visible
plt.xlabel('Year')
plt.ylabel('Average Temperature')
plt.title('K-Means Clustering of US City Temperatures')
plt.show()

In [None]:
# One sub-frame per K-Means label (labels 0, 1 and 2 map to cluster1..cluster3).
cluster1, cluster2, cluster3 = (
    df_USA[df_USA['Cluster'] == label] for label in range(3)
)

In [None]:
# Country-level United States rows for June-August, all available years.
# .copy() makes this an independent frame: a later cell adds a 'Cluster' column
# to df_USA3, and writing to a filtered slice triggers SettingWithCopyWarning.
df_USA3 = df_country[
    (df_country['Country'] == 'United States')
    & df_country['Month'].isin([6, 7, 8])
].copy()
df_USA3

In [None]:
# Features used for clustering: temperature, its uncertainty, and the year.
temperature_data = df_USA2[['AverageTemperature', 'AverageTemperatureUncertainty', 'Year']]

# Standardize each feature before clustering.
scaler = StandardScaler()
temperature_data_scaled = scaler.fit_transform(temperature_data)

# K-Means with a fixed seed so the labels are repeatable between runs.
num_clusters = 2  # Adjust the number of clusters based on your analysis
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
clusters = kmeans.fit_predict(temperature_data_scaled)

# Attach the labels with assign(), which builds a new frame instead of writing
# into a filtered slice (avoids SettingWithCopyWarning) and keeps the cell safe
# to re-run.
df_USA2 = df_USA2.assign(Cluster=clusters)

# Year against temperature, coloured by cluster label.
plt.figure(figsize=(8, 6))  # 8x6 inch figure
plt.scatter(df_USA2['Year'], df_USA2['AverageTemperature'], c=df_USA2['Cluster'], cmap='viridis', alpha=0.7)  # alpha < 1 keeps overlapping points visible
plt.xlabel('Year')
plt.ylabel('Average Temperature')
plt.title('K-Means Clustering of US City Temperatures')
plt.show()

# Cluster quality: mean silhouette over all rows, and the K-Means inertia.
silhouette_avg = silhouette_score(temperature_data_scaled, clusters)
print(f"Silhouette Score: {silhouette_avg}")

inertia_value = kmeans.inertia_
print(f"Inertia: {inertia_value}")

In [None]:
# Features used for clustering: temperature, its uncertainty, and the year.
temperature_data = df_USA3[['AverageTemperature', 'AverageTemperatureUncertainty', 'Year']]

# Standardize each feature before clustering.
scaler = StandardScaler()
temperature_data_scaled = scaler.fit_transform(temperature_data)

# K-Means with a fixed seed so the labels are repeatable between runs.
num_clusters = 2  # Adjust the number of clusters based on your analysis
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
clusters = kmeans.fit_predict(temperature_data_scaled)

# Attach the labels with assign(), which builds a new frame instead of writing
# into a filtered slice (avoids SettingWithCopyWarning) and keeps the cell safe
# to re-run.
df_USA3 = df_USA3.assign(Cluster=clusters)

# Year against temperature, coloured by cluster label.
plt.figure(figsize=(8, 6))  # 8x6 inch figure
plt.scatter(df_USA3['Year'], df_USA3['AverageTemperature'], c=df_USA3['Cluster'], cmap='viridis', alpha=0.7)  # alpha < 1 keeps overlapping points visible
plt.xlabel('Year')
plt.ylabel('Average Temperature')
plt.title('K-Means Clustering of US Temperatures')
plt.show()

# Cluster quality: mean silhouette over all rows, and the K-Means inertia.
silhouette_avg = silhouette_score(temperature_data_scaled, clusters)
print(f"Silhouette Score: {silhouette_avg}")

inertia_value = kmeans.inertia_
print(f"Inertia: {inertia_value}")

In [None]:
# Numeric feature columns of df_USA.
# - include='number' selects every numeric width; a fixed ['float64', 'int64'] list
#   would skip Year/Month when `.dt.year` / `.dt.month` yield int32 (pandas >= 2.0).
# - Any 'Cluster' column left by an earlier K-Means cell is removed so that old
#   labels are never fed back in as a feature.
df_numeric = (
    df_USA
    .select_dtypes(include='number')
    .drop(columns=['Cluster'], errors='ignore')
)
numeric_columns = df_numeric.columns

# Standardize each feature before clustering.
scaler = preprocessing.StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

# Fixed n_init and random_state make the clustering, and therefore the
# silhouette value printed below, repeatable between runs; the settings match
# the other K-Means cells in this notebook.
model = cluster.KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(df_scaled)

# Cluster label for every row.
cluster_ids = model.predict(df_scaled)

# Per-row silhouette values and their mean.
silhouette_scores = metrics.silhouette_samples(df_scaled, cluster_ids)
mean_silhouette = metrics.silhouette_score(df_scaled, cluster_ids)
print(mean_silhouette)

In [None]:
# Row count per country in df_country, in descending order of frequency.
df_country['Country'].value_counts()