In [106]:
#read csv
import pandas as pd
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import warnings

In [107]:
ufos = pd.read_csv("ufos.csv")

In [108]:
ufos.head(3)

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20.0,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667


In [109]:
ufos.dtypes

datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)      float64
duration (hours/min)     object
comments                 object
date posted              object
latitude                float64
longitude               float64
dtype: object

In [110]:
#Convert the ufos data to a small dataframe with fresh titles.
ufos = pd.DataFrame({'Seconds': ufos['duration (seconds)'], 'Country': ufos['country'],'Latitude': ufos['latitude'],'Longitude': ufos['longitude']})

In [111]:
#check for the new dataframe
ufos.head(3)

Unnamed: 0,Seconds,Country,Latitude,Longitude
0,2700.0,us,29.883056,-97.941111
1,7200.0,,29.38421,-98.581082
2,20.0,gb,53.2,-2.916667


In [113]:
#Check the unique values in the Country field.
ufos.Country.unique()

array(['us', nan, 'gb', 'ca', 'au', 'de'], dtype=object)

In [114]:
#reducing the amount of data to be dealt with by dropping null values and only importing sightings between 1-60 seconds
ufos.dropna(inplace=True)
ufos = ufos[(ufos['Seconds'] >= 1) & (ufos['Seconds'] <= 60)]
ufos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25863 entries, 2 to 80330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Seconds    25863 non-null  float64
 1   Country    25863 non-null  object 
 2   Latitude   25863 non-null  float64
 3   Longitude  25863 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1010.3+ KB


In [115]:
ufos

Unnamed: 0,Seconds,Country,Latitude,Longitude
2,20.0,gb,53.200000,-2.916667
3,20.0,us,28.978333,-96.645833
14,30.0,us,35.823889,-80.253611
23,60.0,us,45.582778,-122.352222
24,3.0,gb,51.783333,-0.783333
...,...,...,...,...
80320,60.0,us,33.209722,-87.569167
80321,3.0,us,36.529722,-87.359444
80323,60.0,us,29.651389,-82.325000
80326,20.0,us,34.101389,-84.519444


In [116]:
# converting the text values for countries to a number using LabelEncoder
ufos['Country'] = LabelEncoder().fit_transform(ufos['Country'])
warnings.filterwarnings("ignore")# to ignore warnings coming from the use of labelencoder
ufos.head()

Unnamed: 0,Seconds,Country,Latitude,Longitude
2,20.0,3,53.2,-2.916667
3,20.0,4,28.978333,-96.645833
14,30.0,4,35.823889,-80.253611
23,60.0,4,45.582778,-122.352222
24,3.0,3,51.783333,-0.783333


In [148]:
ufos.sample(5)

Unnamed: 0,Seconds,Country,Latitude,Longitude
8337,40.0,4,32.715278,-117.156389
45051,15.0,1,43.666667,-79.416667
42795,1.0,4,41.610556,-86.7225
6783,2.0,4,26.0625,-80.233333
22286,60.0,4,47.658889,-117.425


In [117]:
#Selecting the model features and target
from sklearn.model_selection import train_test_split
Selected_features = ['Seconds','Latitude','Longitude']
X = ufos[Selected_features]
y = ufos['Country']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [130]:
#training the model using logistic regression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print(classification_report(y_test, predictions))
print('Predicted labels: ', predictions)
print('Accuracy: ', accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       0.83      0.23      0.36       250
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00       131
           4       0.96      1.00      0.98      4743

    accuracy                           0.96      5173
   macro avg       0.96      0.85      0.87      5173
weighted avg       0.96      0.96      0.95      5173

Predicted labels:  [4 4 4 ... 3 4 4]
Accuracy:  0.9605644693601392


In [134]:
#adding the model into pickle
import pickle
model_filename = "ufo-model.pkl"
pickle.dump(model, open(model_filename, "wb"))

In [155]:
model = pickle.load(open('ufo-model.pkl','rb'))
print(model.predict([[15.0, 43.666667, -79.416667]]))

[4]
