In [1]:
import pandas as pd
import dask.dataframe as ddf
import hvplot.dask as plt


# load
fpath = f'scrubbed.csv'
colnames = [
        'datetime',
        'city',
        'state',
        'country',
        'shape',
        'duration',
        'duration (hours/min)',
        'comments',
        'date posted',
        'latitude',
        'longitude',
        'empty'
    ]

with open(fpath,'r') as file:
    df = pd.read_csv(file, names=colnames, skiprows=1)

to_drop = [
    'empty', 
    'duration (hours/min)', 
    'date posted'
]
df.drop(labels=to_drop, axis=1, inplace = True)
df.head()

  df = pd.read_csv(file, names=colnames, skiprows=1)


Unnamed: 0,datetime,city,state,country,shape,duration,comments,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,This event took place in early fall around 194...,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1949 Lackland AFB&#44 TX. Lights racing acros...,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,Green/Orange circular disc over Chester&#44 En...,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,My older brother and twin sister were leaving ...,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,AS a Marine 1st Lt. flying an FJ4B fighter/att...,21.4180556,-157.803611


In [3]:
import pandas as pd
import re

# Function to split datetime into components
def split_datetime(df, column_name):
    # Regex pattern
    pattern = r"(\d{2})/(\d{2})/(\d{4})\s(\d{2}):(\d{2})"
    
    # Lists to store the extracted components
    days, months, years, hours, minutes = [], [], [], [], []
    
    # Iterate over the column and apply regex
    for datetime_str in df[column_name]:
        match = re.match(pattern, datetime_str)
        if match:
            day, month, year, hour, minute = match.groups()
            days.append(day)
            months.append(month)
            years.append(year)
            hours.append(hour)
            minutes.append(minute)
        else:
            days.append(None)
            months.append(None)
            years.append(None)
            hours.append(None)
            minutes.append(None)
    
    # Add the extracted components as new columns
    df['day'] = days
    df['month'] = months
    df['year'] = years
    df['hour'] = hours
    df['minute'] = minutes
    
    return df

# Apply the function to the DataFrame
df = split_datetime(df, 'datetime')

print(df)

               datetime                  city state country     shape  \
0      10/10/1949 20:30            san marcos    tx      us  cylinder   
1      10/10/1949 21:00          lackland afb    tx     NaN     light   
2      10/10/1955 17:00  chester (uk/england)   NaN      gb    circle   
3      10/10/1956 21:00                  edna    tx      us    circle   
4      10/10/1960 20:00               kaneohe    hi      us     light   
...                 ...                   ...   ...     ...       ...   
80327  09/09/2013 21:15             nashville    tn      us     light   
80328  09/09/2013 22:00                 boise    id      us    circle   
80329  09/09/2013 22:00                  napa    ca      us     other   
80330  09/09/2013 22:20                vienna    va      us    circle   
80331  09/09/2013 23:00                edmond    ok      us     cigar   

      duration                                           comments    latitude  \
0         2700  This event took place in e

In [5]:
# Select specific columns
df1 = df[['longitude', 'latitude', 'day', 'month', 'year', 'hour', 'minute']]

print(df1)




        longitude    latitude day month  year hour minute
0      -97.941111  29.8830556  10    10  1949   20     30
1      -98.581082    29.38421  10    10  1949   21     00
2       -2.916667        53.2  10    10  1955   17     00
3      -96.645833  28.9783333  10    10  1956   21     00
4     -157.803611  21.4180556  10    10  1960   20     00
...           ...         ...  ..   ...   ...  ...    ...
80327  -86.784444   36.165833  09    09  2013   21     15
80328 -116.202500   43.613611  09    09  2013   22     00
80329 -122.284444   38.297222  09    09  2013   22     00
80330  -77.265556   38.901111  09    09  2013   22     20
80331  -97.477778   35.652778  09    09  2013   23     00

[80332 rows x 7 columns]


In [58]:
# Convert columns to numeric types and coerce errors to NaN using .loc
df1.loc[:, 'longitude'] = pd.to_numeric(df1['longitude'], errors='coerce')
df1.loc[:, 'latitude'] = pd.to_numeric(df1['latitude'], errors='coerce')
df1.loc[:, 'day'] = pd.to_numeric(df1['day'], errors='coerce')
df1.loc[:, 'month'] = pd.to_numeric(df1['month'], errors='coerce')
df1.loc[:, 'year'] = pd.to_numeric(df1['year'], errors='coerce')
df1.loc[:, 'hour'] = pd.to_numeric(df1['hour'], errors='coerce')
df1.loc[:, 'minute'] = pd.to_numeric(df1['minute'], errors='coerce')

# Drop rows with any NaN values
df1_cleaned = df1.dropna()

print(df1_cleaned.dtypes)

longitude    float64
latitude      object
day           object
month         object
year          object
hour          object
minute        object
dtype: object


In [9]:
##Random Forest

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [13]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

ModuleNotFoundError: No module named 'graphviz'

In [15]:
# Features and targets
X = df1_cleaned[['day', 'month', 'year']]
y = df1_cleaned[['longitude', 'latitude', 'hour', 'minute']]

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)


In [19]:
# Train the model
rf.fit(X_train, y_train)

In [21]:
# Prepare the input data for prediction
test_data = pd.DataFrame({
    'day': [5],
    'month': [11],
    'year': [2024]
})

# Make predictions
predictions = rf.predict(test_data)

# Convert predictions to a DataFrame for better readability
predictions_df = pd.DataFrame(predictions, columns=['longitude', 'latitude', 'hour', 'minute'])

print(predictions_df)

   longitude  latitude       hour     minute
0 -60.954127  27.39091  17.168892  23.249875


In [23]:
#Accuracy of the model
from sklearn.metrics import accuracy_score
y_pred = rf.predict(X_test)




In [94]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 12.450286118187941
Mean Squared Error (MSE): 467.74841493405233
R-squared (R²): 0.01150299702234997


In [96]:

# Create a map centered around the predicted coordinates
import folium

# Predicted coordinates (example values)
predicted_longitude = -60.954127
predicted_latitude = 27.39091

# Create a map centered around the predicted coordinates
map = folium.Map(location=[predicted_latitude, predicted_longitude], zoom_start=12)

# Add a marker for the predicted location
folium.Marker(
    location=[predicted_latitude, predicted_longitude],
    popup='Predicted Location',
    icon=folium.Icon(color='blue', icon='info-sign')
).add_to(map)

# Save the map to an HTML file
map.save('predicted_location_map12.html')

In [27]:
####Polynomial regression

In [29]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [31]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# Transform features to polynomial features
degree = 3  # Degree of the polynomial
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [46]:
# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train_poly, y_train)

In [48]:
# Make predictions on the test set
y_pred = model.predict(X_test_poly)

In [50]:
# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 12.450286118187941
Mean Squared Error (MSE): 467.74841493405233
R-squared (R²): 0.01150299702234997


In [86]:
# Example prediction for a specific day, month, and year
test_data = pd.DataFrame({
    'day': [5],
    'month': [11],
    'year': [2024]
})
test_data_poly = poly.transform(test_data)
prediction = model.predict(test_data_poly)

# Convert prediction to a DataFrame for better readability
prediction_df = pd.DataFrame(prediction, columns=['longitude', 'latitude', 'hour', 'minute'])
print(prediction_df)

   longitude   latitude       hour     minute
0 -89.927808  37.949773  19.881976  18.221968


In [90]:
import folium
# Create a map centered around the predicted coordinates
map = folium.Map(location=prediction_df(['latitude']), prediction_df(['longitude']), zoom_start=12)
# Add a marker for the predicted location
folium.Marker(
    location=prediction_df(['latitude']), prediction_df(['longitude']),
    popup='Predicted Location',
    icon=folium.Icon(color='blue', icon='info-sign')
).add_to(map)

# Save the map to an HTML file
map.save('predicted_location_map1.html')

SyntaxError: positional argument follows keyword argument (3908573901.py, line 3)

In [92]:

# Create a map centered around the predicted coordinates
import folium

# Predicted coordinates (example values)
predicted_longitude = -89.927808
predicted_latitude = 37.949773

# Create a map centered around the predicted coordinates
map = folium.Map(location=[predicted_latitude, predicted_longitude], zoom_start=12)

# Add a marker for the predicted location
folium.Marker(
    location=[predicted_latitude, predicted_longitude],
    popup='Predicted Location',
    icon=folium.Icon(color='blue', icon='info-sign')
).add_to(map)

# Save the map to an HTML file
map.save('predicted_location_map1.html')

In [None]:
#linear Regression

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [62]:
# Feature selection
X = df1_cleaned[['day', 'month', 'year']]
y_longitude = df1_cleaned['longitude']
y_latitude = df1_cleaned['latitude']
y_hour = df1_cleaned['hour']
y_minute = df1_cleaned['minute']

In [64]:
# Split the data
X_train, X_test, y_long_train, y_long_test = train_test_split(X, y_longitude, test_size=0.3, random_state=42)
X_train, X_test, y_lat_train, y_lat_test = train_test_split(X, y_latitude, test_size=0.3, random_state=42)
X_train, X_test, y_hour_train, y_hour_test = train_test_split(X, y_hour, test_size=0.3, random_state=42)
X_train, X_test, y_min_train, y_min_test = train_test_split(X, y_minute, test_size=0.3, random_state=42)

In [66]:
# Initialize the model
model_long = LinearRegression()
model_lat = LinearRegression()
model_hour = LinearRegression()
model_min = LinearRegression()

In [70]:
# Train the model
model_long.fit(X_train, y_long_train)
model_lat.fit(X_train, y_lat_train)
model_hour.fit(X_train, y_hour_train)
model_min.fit(X_train, y_min_train)

In [72]:
# Make predictions
y_long_pred = model_long.predict(X_test)
y_lat_pred = model_lat.predict(X_test)
y_hour_pred = model_hour.predict(X_test)
y_min_pred = model_min.predict(X_test)

In [74]:
# Evaluate the model
print('Longitude R^2:', r2_score(y_long_test, y_long_pred))
print('Latitude R^2:', r2_score(y_lat_test, y_lat_pred))
print('Hour R^2:', r2_score(y_hour_test, y_hour_pred))
print('Minute R^2:', r2_score(y_min_test, y_min_pred))

Longitude R^2: 0.0009096409801185867
Latitude R^2: -0.0007015856769991835
Hour R^2: 0.0017972354625522824
Minute R^2: 0.025532780507647512


In [76]:
# Example date
date = {'day': 5, 'month': 11, 'year': 2024}
X_new = np.array([[date['day'], date['month'], date['year']]])

# Predict using the trained models
longitude_pred = model_long.predict(X_new)
latitude_pred = model_lat.predict(X_new)
hour_pred = model_hour.predict(X_new)
minute_pred = model_min.predict(X_new)

print(f"Predicted Longitude: {longitude_pred[0]}")
print(f"Predicted Latitude: {latitude_pred[0]}")
print(f"Predicted Hour: {hour_pred[0]}")
print(f"Predicted Minute: {minute_pred[0]}")

Predicted Longitude: -87.264247590311
Predicted Latitude: 37.882713971191166
Predicted Hour: 15.819081472928517
Predicted Minute: 23.329999804606132




In [78]:
import folium

In [84]:
# Create a map centered around the predicted coordinates
map = folium.Map(location=[latitude_pred, longitude_pred], zoom_start=12)
# Add a marker for the predicted location
folium.Marker(
    location=[latitude_pred, longitude_pred],
    popup='Predicted Location',
    icon=folium.Icon(color='blue', icon='info-sign')
).add_to(map)

# Save the map to an HTML file
map.save('predicted_location_map.html')