In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import pymysql
import requests
from sklearn import datasets
from sklearn.datasets import make_regression, make_classification, make_blobs
from sqlalchemy import create_engine
# import pandavro as pdx

In [None]:
# Digits dataset: load the copy bundled with scikit-learn.
digits = datasets.load_digits()
# Pull the feature matrix and the target vector out of the loaded object.
features, vector = digits.data, digits.target
# Show the feature values of the first observation.
features[0]

In [None]:
# Dataset Attributes
# Print the DESCR attribute of the digits object loaded in the previous cell;
# print() is used so embedded newlines render as line breaks instead of "\n".
print(digits.DESCR)

In [None]:
# Creating a Simulated Dataset
# make_regression produces a dataset designed to be used with linear regression.
features, target, coefficients = make_regression(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_targets=1,
    noise=0.0,
    coef=True,       # also return the coefficients (third unpacked value)
    random_state=1,  # fixed seed so re-running the cell gives the same data
)
# Preview the first three rows of the features and of the target.
print('Feature Matrix\n', features[:3])
print('Target Vector\n', target[:3])

In [None]:
# Creating a simulated dataset for classification with make_classification.
features, target = make_classification(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    weights=[0.25, 0.75],  # per-class proportions passed to the generator
    random_state=1,        # fixed seed so re-running the cell gives the same data
)
# Preview the first three rows of the features and of the target.
print('Feature Matrix\n', features[:3])
print('Target Vector\n', target[:3])

In [None]:
# Creating a simulated dataset with make_blobs (three centers, two features).
features, target = make_blobs(
    n_samples=100,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=1,  # fixed seed so re-running the cell gives the same data
)
# Preview the first three rows of the features and of the target.
print('Feature Matrix\n', features[:3])
print('Target Vector\n', target[:3])

In [None]:
# Scatter plot of the first two feature columns, colored by the target values.
# Uses the explicit figure/axes interface rather than the implicit pyplot state.
fig, ax = plt.subplots()
ax.scatter(features[:, 0], features[:, 1], c=target)
# Label the axes and add a title so the figure can be read on its own.
ax.set(xlabel='Feature 0', ylabel='Feature 1', title='Features colored by target')
plt.show()

In [None]:
# Loading a csv file
# Remote location of the sample CSV.
CSV_URL = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/data.csv'
# Read it into a DataFrame and display the first five rows.
df_csv = pd.read_csv(filepath_or_buffer=CSV_URL)
df_csv.head(n=5)

In [None]:
# Loading a local csv file; the path is relative to the working directory.
CAR_CSV_PATH = "../car_data.csv"
car_df = pd.read_csv(filepath_or_buffer=CAR_CSV_PATH)
# Display the first five rows.
car_df.head(n=5)

In [None]:
# Loading an Excel File
# Remote location of the sample workbook.
EXCEL_URL = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/data.xlsx'
# Read the first sheet (index 0), using its first row (index 0) as the header.
df_excel = pd.read_excel(
    io=EXCEL_URL,
    sheet_name=0,
    header=0,
)
df_excel.head(n=5)

In [None]:
# Loading JSON file
# Remote location of the sample JSON document.
JSON_URL = 'https://raw.githubusercontent.com/chrisalbon/sim_data/master/data.json'
# orient='columns' tells pandas how the JSON is laid out.
df_json = pd.read_json(
    path_or_buf=JSON_URL,
    orient='columns',
)
df_json.head(n=5)

In [None]:
# Loading Parquet File
# Remote location of the sample Parquet file.
PARQUET_URL = 'https://machine-learning-python-cookbook.s3.amazonaws.com/data.parquet'
# Read it into a DataFrame and display the first five rows.
df_parquet = pd.read_parquet(path=PARQUET_URL)
df_parquet.head(n=5)


In [None]:
# Loading an avro File
AVRO_URL = 'https://machine-learning-python-cookbook.s3.amazonaws.com/data.avro'
# Bound the request with a timeout so the cell cannot hang indefinitely.
r = requests.get(AVRO_URL, timeout=30)
# Fail loudly on an HTTP error status so an error page is never saved as data.avro.
r.raise_for_status()
# Write the downloaded bytes to a local file.
with open('data.avro', 'wb') as f:
    f.write(r.content)

# Reading the file back requires pandavro, whose import is commented out in the
# import cell; the lines below stay disabled until it is available.
# df_avro = pdx.read_avro('data.avro')
# df_avro.head()

In [None]:
# Querying a SQLite database
# Create a SQLAlchemy engine for the local sample.db file.
db_conn = create_engine('sqlite:///sample.db')
# Run the query through the engine and collect the rows in a DataFrame.
df_sql = pd.read_sql_query(
    sql='SELECT * FROM data',
    con=db_conn,
)
df_sql.head(n=5)

In [None]:
# Querying a Remote SQL Database
# NOTE(review): the connection settings are hardcoded (root user, empty
# password) - confirm they are only placeholders for a local test database.
conn = pymysql.connect(
    host='localhost', user='root', password='', db='db'
)
try:
    # Run the query and load every returned row into a DataFrame.
    df_remote_sql = pd.read_sql("select * from data", conn)
finally:
    # Always release the database connection, even if the query fails.
    conn.close()
df_remote_sql.head()

In [None]:
# Loading Data from a Google Sheet
# The URL requests the sheet's export endpoint with format=csv. A stray
# double quote that used to trail the query string has been removed, since it
# was sent to the server as part of the format value.
GOOGLE_SHEET_URL = 'https://docs.google.com/spreadsheets/d/1ehC-9otcAuitqnmWksqt1mOrTRCL38dv0K9UjhwzTOA/export?format=csv'
# Read the exported CSV into a DataFrame and display the first rows.
df_sheets = pd.read_csv(GOOGLE_SHEET_URL)
df_sheets.head()

In [None]:
# Loading data from an s3 Bucket
s3_uri = "s3://machine-learning-python-cookbook/data.csv"
# Read the credentials from the environment instead of hardcoding them in the
# notebook, so secrets are never saved or committed with it. A missing
# variable raises KeyError naming the variable that must be set.
ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]
# Read the CSV into a dataframe
df_s3_bucket = pd.read_csv(
    s3_uri,
    # Credentials are forwarded to the storage backend through storage_options.
    storage_options={
        "key": ACCESS_KEY_ID,
        "secret": SECRET_ACCESS_KEY,
    }
)
df_s3_bucket.head()

In [None]:
# Loading Unstructured Data
TEXT_URL = "https://machine-learning-python-cookbook.s3.amazonaws.com/text.txt"
# Bound the request with a timeout so the cell cannot hang indefinitely.
r = requests.get(TEXT_URL, timeout=30)
# Fail loudly on an HTTP error status so an error page is never saved as text.txt.
r.raise_for_status()
# Save the downloaded bytes to a local file.
with open('text.txt', 'wb') as f:
    f.write(r.content)
# Read the file back as text.
# NOTE(review): no encoding is given, so the platform default is used - confirm
# the file's encoding if this must behave the same on every OS.
with open('text.txt', 'r') as f:
    text = f.read()
print(text)