##### 4. Write a program to download the data from the link given below, read it, convert it into the proper structure, and return it as a CSV file.

Data Attributes

- name - Name of Earth Meteorite - string
- id - ID of Earth Meteorite - int
- nametype - string 
- recclass - string
- mass - Mass of Earth Meteorite - float
- fall - string (`Fell` or `Found`)
- year - Year in which the meteorite struck Earth - datetime format
- reclat - float
- reclong - float
- point coordinates - list of int

In [1]:
# Importing required packages

import requests
import json
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def _to_float(value):
    """Return ``value`` converted to float, or None if it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        # None (missing key / JSON null) raises TypeError, junk text raises ValueError.
        return None


def _extract_record(record):
    """Build one flat output row (a dict) from a single raw JSON record."""
    reclat = _to_float(record.get('reclat'))
    reclong = _to_float(record.get('reclong'))
    year = record.get('year')

    return {
        'name': record.get('name'),
        'id': record.get('id'),
        # Encoded as a flag: 1 when nametype is "Valid", 0 for anything else.
        'nametype': 1 if record.get('nametype', '') == "Valid" else 0,
        'recclass': record.get('recclass'),
        'mass': _to_float(record.get('mass')),
        'fall': record.get('fall'),
        # Keep only the first 10 characters of the timestamp string
        # (the date part, e.g. "1880-01-01"); a missing or null year stays None.
        'year': year[:10] if isinstance(year, str) else None,
        'reclat': reclat,
        'reclong': reclong,
        # The coordinate pair is only emitted when both values are usable.
        'point coordinates': [reclat, reclong]
        if reclat is not None and reclong is not None else None,
    }


def download_data(url, timeout=30):
    """Download the JSON data at ``url`` and extract the required fields.

    Parameters
    ----------
    url : str
        Address of a JSON endpoint that returns a list of records.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` so that a
        stalled connection cannot block the notebook forever.

    Returns
    -------
    list of dict or None
        One dict per record with the keys ``name``, ``id``, ``nametype``,
        ``recclass``, ``mass``, ``fall``, ``year``, ``reclat``, ``reclong``
        and ``point coordinates``. Fields that are missing or cannot be
        converted are set to None. None is returned (after printing a
        message) when the request fails, the body is not valid JSON, or
        the decoded payload is not a list.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        print(f"HTTP error occurred: {error}")
        return None
    except requests.exceptions.RequestException as error:
        print(f"An error occurred: {error}")
        return None

    try:
        raw_data = response.json()
    except ValueError as error:
        # JSON decoding errors are ValueError subclasses.
        print(f"An error occurred while decoding the response: {error}")
        return None

    if not isinstance(raw_data, list):
        print(f"Unexpected response format: expected a list of records, got {type(raw_data).__name__}")
        return None

    # Extracting the required data from the raw data; entries that are not
    # JSON objects cannot be read with .get() and are skipped.
    return [_extract_record(record) for record in raw_data if isinstance(record, dict)]

# Converting the extracted data into a DataFrame and saving it as a CSV file.

def save_csv(data, filename):
    """Write ``data`` to ``filename`` as a CSV file without the index column.

    ``data`` is handed to ``pd.DataFrame`` as-is. Any exception raised while
    building the frame, writing the file, or printing the success message is
    caught and reported with a printed message instead of propagating.
    """
    try:
        frame = pd.DataFrame(data)
        # index=False keeps the DataFrame's row index out of the file.
        frame.to_csv(path_or_buf=filename, index=False)
        print("Data save successfully...👍")
    except Exception as exc:
        print(f"An error occurred while saving the file: {exc}")

if __name__ == '__main__':
    # Name of the CSV file to write.
    filename = 'meteorite.csv'
    # JSON endpoint to download the data from.
    url = 'https://data.nasa.gov/resource/y77d-th95.json'

    # Fetch and restructure the records.
    data = download_data(url)

    # Only write the CSV when the download step produced a result.
    if data is not None:
        save_csv(data, filename)

Data save successfully...👍


In [3]:
pd.read_csv("meteorite.csv")    # Reading the dataset

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,point coordinates
0,Aachen,1,1,L5,21.0,Fell,1880-01-01,50.77500,6.08333,"[50.775, 6.08333]"
1,Aarhus,2,1,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"[56.18333, 10.23333]"
2,Abee,6,1,EH4,107000.0,Fell,1952-01-01,54.21667,-113.00000,"[54.21667, -113.0]"
3,Acapulco,10,1,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.90000,"[16.88333, -99.9]"
4,Achiras,370,1,L6,780.0,Fell,1902-01-01,-33.16667,-64.95000,"[-33.16667, -64.95]"
...,...,...,...,...,...,...,...,...,...,...
995,Tirupati,24009,1,H6,230.0,Fell,1934-01-01,13.63333,79.41667,"[13.63333, 79.41667]"
996,Tissint,54823,1,Martian (shergottite),7000.0,Fell,2011-01-01,29.48195,-7.61123,"[29.48195, -7.61123]"
997,Tjabe,24011,1,H6,20000.0,Fell,1869-01-01,-7.08333,111.53333,"[-7.08333, 111.53333]"
998,Tjerebon,24012,1,L5,16500.0,Fell,1922-01-01,-6.66667,106.58333,"[-6.66667, 106.58333]"


In [4]:
raw_dataset  = pd.read_csv("meteorite.csv")      # Storing data into raw_dataset variable.

In [5]:
raw_dataset.shape       # Checking shape of the dataset

(1000, 10)

In [6]:
raw_dataset.head()      # Top 5 records of the dataset

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,point coordinates
0,Aachen,1,1,L5,21.0,Fell,1880-01-01,50.775,6.08333,"[50.775, 6.08333]"
1,Aarhus,2,1,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"[56.18333, 10.23333]"
2,Abee,6,1,EH4,107000.0,Fell,1952-01-01,54.21667,-113.0,"[54.21667, -113.0]"
3,Acapulco,10,1,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.9,"[16.88333, -99.9]"
4,Achiras,370,1,L6,780.0,Fell,1902-01-01,-33.16667,-64.95,"[-33.16667, -64.95]"


In [7]:
raw_dataset.isnull().sum()  # Checking Missing Values.

name                  0
id                    0
nametype              0
recclass              0
mass                 28
fall                  0
year                  1
reclat               12
reclong              12
point coordinates    12
dtype: int64

- There are missing values in the ***mass***, ***year***, ***reclat***, ***reclong*** and ***point coordinates*** columns.

In [8]:
raw_dataset.dtypes      # Checking datatypes of each features.

name                  object
id                     int64
nametype               int64
recclass              object
mass                 float64
fall                  object
year                  object
reclat               float64
reclong              float64
point coordinates     object
dtype: object

In [9]:
dataset = raw_dataset.copy()       # Independent copy: later changes to dataset leave raw_dataset untouched as a backup.

#### Feature Engineering

In [10]:
dataset.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,point coordinates
0,Aachen,1,1,L5,21.0,Fell,1880-01-01,50.775,6.08333,"[50.775, 6.08333]"
1,Aarhus,2,1,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"[56.18333, 10.23333]"
2,Abee,6,1,EH4,107000.0,Fell,1952-01-01,54.21667,-113.0,"[54.21667, -113.0]"
3,Acapulco,10,1,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.9,"[16.88333, -99.9]"
4,Achiras,370,1,L6,780.0,Fell,1902-01-01,-33.16667,-64.95,"[-33.16667, -64.95]"


In [11]:
dataset.dtypes

name                  object
id                     int64
nametype               int64
recclass              object
mass                 float64
fall                  object
year                  object
reclat               float64
reclong              float64
point coordinates     object
dtype: object

In [12]:
dataset['id'].unique()

array([    1,     2,     6,    10,   370,   379,   390,   392,   398,
         417,   423,   424,   425,   426,   427,   432,   433,   446,
         447,   448,   453,   454,   458,   461,   462,   463,   465,
         466,  2276,  2278, 48915,  2284,  2290,  2294,  2295,  2296,
        2298, 50693,  2301,  2302,  2304,  2305,  2313,  2318,  2320,
        2325,  2329,  2340,  2345, 48954,  2346,  2353,  4883,  4884,
        4885,  4888,  4893,  4899,  4900,  4903,  4905,  4906,  4907,
        4910,  4913,  4917,  4922,  4925,  4926,  4928,  4934,  4935,
        4936,  4937,  4938,  4942,  4944,  4946,  4947,  4948,  4949,
        4954, 44876,  4957,  4974,  4975, 56133,  4976,  4977,  4984,
        4986,  4993,  5005,  5009,  5011, 30443,  5018,  5021,  5023,
        5024,  5026, 48975,  5028,  5029, 47355,  5032,  5034,  5035,
        5037, 36591,  5039,  5040,  5041,  5042,  5043,  5045,  5051,
        5056,  5059,  5060,  5063,  5064,  5065,  5068,  5071,  5072,
        5076,  5090,

In [13]:
len(dataset['id'].unique())

1000

In [14]:
dataset['nametype'].unique()

array([1], dtype=int64)

In [15]:
len(dataset['nametype'].unique())

1

In [16]:
dataset = dataset.drop(columns=["nametype"])        # Dropping nametype: it holds the same value (1) in every row. Rebinding instead of inplace=True avoids mutating the original frame.

In [21]:
dataset['recclass'].unique()

array(['L5', 'H6', 'EH4', 'Acapulcoite', 'L6', 'LL3-6', 'H5', 'L',
       'Diogenite-pm', 'Unknown', 'H4', 'H', 'Iron, IVA', 'CR2-an', 'LL5',
       'CI1', 'L/LL4', 'Eucrite-mmict', 'CV3', 'Ureilite-an',
       'Stone-uncl', 'L3', 'Angrite', 'LL6', 'L4', 'Aubrite',
       'Iron, IIAB', 'Iron, IAB-sLL', 'Iron, ungrouped', 'CM2', 'OC',
       'Mesosiderite-A1', 'LL4', 'C2-ung', 'LL3.8', 'Howardite',
       'Eucrite-pmict', 'Diogenite', 'LL3.15', 'LL3.9', 'Iron, IAB-MG',
       'H/L3.9', 'Iron?', 'Eucrite', 'H4-an', 'L/LL6', 'Iron, IIIAB',
       'H/L4', 'H4-5', 'L3.7', 'LL3.4', 'Martian (chassignite)', 'EL6',
       'H3.8', 'H3-5', 'H5-6', 'Mesosiderite', 'H5-7', 'L3-6', 'H4-6',
       'Ureilite', 'Iron, IID', 'Mesosiderite-A3/4', 'CO3.3', 'H3',
       'EH3/4-an', 'Iron, IIE', 'L/LL5', 'H3.7', 'CBa', 'H4/5', 'H3/4',
       'H?', 'H3-6', 'L3.4', 'Iron, IAB-sHL', 'L3.7-6', 'EH7-an', 'Iron',
       'CR2', 'CO3.2', 'K3', 'L5/6', 'CK4', 'Iron, IIE-an', 'L3.6',
       'LL3.2', 'Pallasite', 'CO

In [22]:
len(dataset['recclass'].unique())

118

In [23]:
dataset.head()

Unnamed: 0,name,id,recclass,mass,fall,year,reclat,reclong,point coordinates
0,Aachen,1,L5,21.0,Fell,1880-01-01,50.775,6.08333,"[50.775, 6.08333]"
1,Aarhus,2,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"[56.18333, 10.23333]"
2,Abee,6,EH4,107000.0,Fell,1952-01-01,54.21667,-113.0,"[54.21667, -113.0]"
3,Acapulco,10,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.9,"[16.88333, -99.9]"
4,Achiras,370,L6,780.0,Fell,1902-01-01,-33.16667,-64.95,"[-33.16667, -64.95]"


In [32]:
dataset.isnull().sum()

name                  0
id                    0
recclass              0
mass                 28
fall                  0
year                  1
reclat               12
reclong              12
point coordinates    12
dtype: int64

In [38]:
dataset['fall'].count()

1000

In [36]:
dataset['fall'].unique()

array(['Fell', 'Found'], dtype=object)