In [349]:
import pandas as pd
import sqlite3
import sqlalchemy as sa
from sqlalchemy.types import *
from sklearn.preprocessing import LabelEncoder

In [365]:
connection = sqlite3.connect("database.db")

In [453]:
engine = sa.create_engine('sqlite:////home/roman/BI/python/BI_2020-2021_Python/SQL_homework/database.db')

In [454]:
connection = engine.connect()

# Data preprocessing

In [428]:
# It is seems like '@' was not the best separator, because it is present in nearly 20 titles
df = pd.read_csv("results_upd.csv", sep="@", error_bad_lines=False, warn_bad_lines=False, na_values="None").dropna()
df.id = df.id.astype(int)
df.score = df.score.astype(float)
df.rating_count = df.rating_count.astype(int)
df.rank = df["rank"].astype(int)
df.popularity = df.popularity.astype(int)
df = df.drop(columns=["favorites"])
df.year = df.year.astype(int)
df.episodes = df.episodes.where(df.episodes != "Unknown", 0)
df.genres = df.genres.apply(lambda x: x.split(","))
df["coarse_genre"] = df.genres.apply(lambda x: x[0])
df["all_genres"] = df.genres.apply(lambda x: " ".join(x))
df.studio = df.studio.where(lambda x: x != "add some")

In [431]:
anime_table_schema = {"id": sa.Column("id", INT(), sa.ForeignKey("descriptions.id"),
                                            sa.ForeignKey("ranks.id"), primary_key=True, autoincrement=True),
                      "title_english": sa.Column("title_english", VARCHAR(100)),
                      "title_japanese": sa.Column("title_japanese", VARCHAR(100))}

description_table_schema = {"id": sa.Column("id", INT(), primary_key=True, autoincrement=True),
                            "synopsys": sa.Column("synopsys", TEXT(), nullable=True),
                            "type": sa.Column("type", VARCHAR(10), nullable=True),
                            "episodes": sa.Column("episodes", INT(), nullable=True),
                            "rating": sa.Column("rating", VARCHAR(10), nullable=True),
                            "duration": sa.Column("duration", VARCHAR(20), nullable=True),
                            "season": sa.Column("season", VARCHAR(10), nullable=True),
                            "year": sa.Column("year", INT(), nullable=True),
                            "coarse_genre": sa.Column("coarse_genre", INT(),
                                                      sa.ForeignKey("coarse_genres.id"),
                                                      nullable=True),
                            "all_genres": sa.Column("all_genres", INT(),
                                                      sa.ForeignKey("all_genres.id"),
                                                      nullable=True)}

ranks_table_schema = {"id": sa.Column("id", INT(), sa.ForeignKey("anime.id"), primary_key=True, autoincrement=True),
                      "score": sa.Column("score", DECIMAL(4, 2)),
                      "rating_count": sa.Column("rating_count", INT(), nullable=True),
                      "rank": sa.Column("rank", INT(), nullable=True),
                      "popularity": sa.Column("popularity", INT(), nullable=True)}

coarse_genre_table_schema = {"id": sa.Column("id", INT(), primary_key=True, autoincrement=True),
                             "coarse_genre": sa.Column("coarse_genre", VARCHAR(20), nullable=True)}

all_genres_table_schema = {"id": sa.Column("id", INT(), primary_key=True, autoincrement=True),
                           "all_genres": sa.Column("all_genres", TEXT(), nullable=True)}

In [455]:
a = sa.schema.MetaData(bind=connection)

In [456]:
anime_table_schema = sa.Table("anime", a, sa.Column("id", INT(), sa.ForeignKey("descriptions.id"),
                              sa.ForeignKey("ranks.id"), primary_key=True, autoincrement=True),
                              sa.Column("title_english", VARCHAR(100)),
                              sa.Column("title_japanese", VARCHAR(100)))

description_table_schema = sa.Table("descriptions", a, sa.Column("id", INT(), primary_key=True, autoincrement=True),
                                    sa.Column("synopsys", TEXT(), nullable=True),
                                    sa.Column("type", VARCHAR(10), nullable=True),
                                    sa.Column("episodes", INT(), nullable=True),
                                    sa.Column("rating", VARCHAR(10), nullable=True),
                                    sa.Column("duration", VARCHAR(20), nullable=True),
                                    sa.Column("season", VARCHAR(10), nullable=True),
                                    sa.Column("year", INT(), nullable=True),
                                    sa.Column("coarse_genre", INT(),
                                                      sa.ForeignKey("coarse_genres.id"),
                                                      nullable=True),
                                    sa.Column("all_genres", INT(),
                                                      sa.ForeignKey("all_genres.id"),
                                                      nullable=True))

ranks_table_schema = sa.Table("ranks", a, sa.Column("id", INT(), sa.ForeignKey("anime.id"), primary_key=True, autoincrement=True),
                              sa.Column("score", DECIMAL(4, 2)),
                              sa.Column("rating_count", INT(), nullable=True),
                              sa.Column("rank", INT(), nullable=True),
                              sa.Column("popularity", INT(), nullable=True))

coarse_genre_table_schema = sa.Table("coarse_genres", a, sa.Column("id", INT(), primary_key=True, autoincrement=True),
                                     sa.Column("coarse_genre", VARCHAR(20), nullable=True))

all_genres_table_schema = sa.Table("all_genres", a, sa.Column("id", INT(), primary_key=True, autoincrement=True),
                                   sa.Column("all_genres", TEXT(), nullable=True))

In [458]:
anime_table_schema.create()
description_table_schema.create()
ranks_table_schema.create()
coarse_genre_table_schema.create()
all_genres_table_schema.create()

In [482]:
connection.execute("SELECT * FROM anime").fetchone()

(1, 'Cowboy Bebop', 'カウボーイビバップ')

In [471]:
b = anime_table_schema.insert()

In [484]:
df.columns

Index(['id', 'title_english', 'title_japanese', 'synopsys', 'score',
       'rating_count', 'rank', 'popularity', 'type', 'episodes', 'rating',
       'duration', 'studio', 'season', 'year', 'genres', 'coarse_genre',
       'all_genres'],
      dtype='object')

In [481]:
with connection.begin(): # open a transaction - this runs in the
    for _, row in df.loc[:, ["id", "title_english", "title_japanese"]].iterrows():
        connection.execute(anime_table_schema.insert(), row.to_dict())

In [485]:
with connection.begin(): # open a transaction - this runs in the
    for _, row in df.loc[:, ["id", 'score', 'rating_count', 'rank', 'popularity']].iterrows():
        connection.execute(ranks_table_schema.insert(), row.to_dict())

In [481]:
with connection.begin(): # open a transaction - this runs in the
    for _, row in df.loc[:, ["id", "title_english", "title_japanese"]].iterrows():
        connection.execute(anime_table_schema.insert(), row.to_dict())

In [481]:
with connection.begin(): # open a transaction - this runs in the
    for _, row in df.loc[:, ["id", "title_english", "title_japanese"]].iterrows():
        connection.execute(anime_table_schema.insert(), row.to_dict())

In [481]:
with connection.begin(): # open a transaction - this runs in the
    for _, row in df.loc[:, ["id", "title_english", "title_japanese"]].iterrows():
        connection.execute(anime_table_schema.insert(), row.to_dict())

In [434]:
encoder = LabelEncoder()

subset = list(anime_table_schema.keys())
df.loc[:, subset].to_sql("anime", engine, if_exists="replace", index=False, dtype=anime_table_schema, )

subset = list(ranks_table_schema.keys())
df.loc[:, subset].to_sql("ranks", engine, if_exists="replace", index=False, dtype=ranks_table_schema)


coarse_genre_data = pd.DataFrame({"id": encoder.fit_transform(df.coarse_genre),
                                  "coarse_genre": df.coarse_genre})

coarse_genre_data.drop_duplicates().to_sql("coarse_genres", engine, if_exists="replace",
                                           index=False, dtype=coarse_genre_table_schema)

all_genres_data = pd.DataFrame({"id": encoder.fit_transform(df.all_genres),
                                "all_genres": df.all_genres})
all_genres_data.drop_duplicates().to_sql("all_genres", engine, if_exists="replace",
                                         index=False, dtype=all_genres_table_schema)

df.coarse_genre = encoder.fit_transform(df.coarse_genre)
df.all_genres = encoder.fit_transform(df.all_genres)

subset = list(description_table_schema.keys())
df.loc[:, subset].to_sql("descriptions", engine, if_exists="replace", index=False, dtype=description_table_schema)

AttributeError: 'Table' object has no attribute 'keys'

In [376]:
cursor = connection.execute("SELECT * FROM anime JOIN ranks")

In [379]:
cursor.fetchmany(2)
cursor.close()

In [399]:
connection.execute("DROP TABLE anime")
connection.execute("DROP TABLE ranks")
connection.execute("DROP TABLE descriptions")
connection.execute("DROP TABLE coarse_genres")
connection.execute("DROP TABLE all_genres")
connection.commit()

OperationalError: no such table: anime

In [404]:
connection.close()

In [414]:
connection = engine.connect()

        

In [452]:
with connection.begin():
    r1 = connection.execute(anime_table_schema.create())

OperationalError: (sqlite3.OperationalError) table anime already exists
[SQL: 
CREATE TABLE anime (
	id INTEGER NOT NULL, 
	title_english VARCHAR(100), 
	title_japanese VARCHAR(100), 
	PRIMARY KEY (id), 
	FOREIGN KEY(id) REFERENCES descriptions (id), 
	FOREIGN KEY(id) REFERENCES ranks (id)
)

]
(Background on this error at: http://sqlalche.me/e/13/e3q8)