# MERGE DATASET - API DATA

### Libreries used

In [33]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine
import psycopg2
import sys
import os

## Connection to database

In [34]:
sys.path.append('../otherconfig/')
from dbconfig import configuration

config = configuration('../otherconfig/database.ini')
db_url = f"postgresql+psycopg2://{config['user']}:{config['password']}@{config['host']}/{config['database']}"
engine = create_engine(db_url)

try:
    conn = engine.connect()
    print("Database connection successful.")
except Exception as e:
    print("Database connection failed:", e)
    sys.exit(1)


Reading configuration from ../otherconfig/database.ini
Database connection successful.


Next, this part will be for the practical use of the dags and the follow-up of the steps.

### Load the CSV data into a DataFrame

In [35]:
metacritic_data_path = '../data/metacritic_data.csv'
try:
    metacritic_data = pd.read_csv(metacritic_data_path)
    print(f"CSV data loaded successfully with {len(metacritic_data)} records.")
except Exception as e:
    print("Failed to load CSV data:", e)
    sys.exit(1)

CSV data loaded successfully with 14055 records.


### Load data into PostgreSQL database

In [36]:
try:
    metacritic_data.to_sql('metacritic_data', conn, if_exists='replace', index=False)
    print("Data loaded into the PostgreSQL database successfully.")
except Exception as e:
    print("Failed to load data into the PostgreSQL database:", e)
    sys.exit(1)

Data loaded into the PostgreSQL database successfully.


## API DATA

#### List of CSV files

In [37]:
api_data_path = '../data/'
csv_files = [os.path.join(api_data_path, f'api_data_{i}.csv') for i in range(1, 14)]

#### Store DataFrames

In [38]:
dataframes = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
        dataframes.append(df)
        print(f"Loaded {file} successfully with {len(df)} records.")
    except Exception as e:
        print(f"Failed to load {file}:", e)
        sys.exit(1)

Loaded ../data/api_data_1.csv successfully with 1000 records.
Loaded ../data/api_data_2.csv successfully with 1000 records.
Loaded ../data/api_data_3.csv successfully with 900 records.
Loaded ../data/api_data_4.csv successfully with 240 records.
Loaded ../data/api_data_5.csv successfully with 1000 records.
Loaded ../data/api_data_6.csv successfully with 1000 records.
Loaded ../data/api_data_7.csv successfully with 1000 records.
Loaded ../data/api_data_8.csv successfully with 1000 records.
Loaded ../data/api_data_9.csv successfully with 1000 records.
Loaded ../data/api_data_10.csv successfully with 1000 records.
Loaded ../data/api_data_11.csv successfully with 1000 records.
Loaded ../data/api_data_12.csv successfully with 1000 records.
Loaded ../data/api_data_13.csv successfully with 1000 records.


#### Concatenate all DataFrames into a single DataFrame

In [39]:
combined_api_data = pd.concat(dataframes, ignore_index=True)
print(f"Combined API data has {len(combined_api_data)} records.")

Combined API data has 12140 records.


In [40]:
combined_api_data.to_csv('../data/api_data.csv', index=False)
print("Combined API data saved to 'api_data.csv'")

Combined API data saved to 'api_data.csv'


## General description

In [41]:
combined_api_data.head()

Unnamed: 0,id,slug,name,released,tba,background_image,rating,rating_top,ratings,ratings_count,...,dominant_color,platforms,parent_platforms,genres,stores,clip,tags,esrb_rating,short_screenshots,community_rating
0,9907,pictopix,Pictopix,2017-01-05,False,https://media.rawg.io/media/screenshots/ff0/ff...,3.96,4,"[{'id': 4, 'title': 'recommended', 'count': 20...",24,...,0f0f0f,"[{'platform': {'id': 4, 'name': 'PC', 'slug': ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 40, 'name': 'Casual', 'slug': 'casual'...","[{'id': 10805, 'store': {'id': 1, 'name': 'Ste...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",
1,4119,chronovolt,Chronovolt,2012-11-21,False,https://media.rawg.io/media/games/8dc/8dc50d62...,2.55,3,"[{'id': 3, 'title': 'meh', 'count': 11, 'perce...",20,...,0f0f0f,"[{'platform': {'id': 19, 'name': 'PS Vita', 's...","[{'platform': {'id': 2, 'name': 'PlayStation',...","[{'id': 4, 'name': 'Action', 'slug': 'action',...","[{'id': 4440, 'store': {'id': 3, 'name': 'Play...",,"[{'id': 37796, 'name': 'exclusive', 'slug': 'e...","{'id': 1, 'name': 'Everyone', 'slug': 'everyone'}","[{'id': -1, 'image': 'https://media.rawg.io/me...",
2,3935,breakquest-extra-evolution,BreakQuest: Extra Evolution,2012-10-16,False,https://media.rawg.io/media/screenshots/4e3/4e...,2.6,3,"[{'id': 3, 'title': 'meh', 'count': 10, 'perce...",20,...,0f0f0f,"[{'platform': {'id': 16, 'name': 'PlayStation ...","[{'platform': {'id': 2, 'name': 'PlayStation',...","[{'id': 11, 'name': 'Arcade', 'slug': 'arcade'...","[{'id': 4253, 'store': {'id': 3, 'name': 'Play...",,"[{'id': 114, 'name': 'Physics', 'slug': 'physi...","{'id': 2, 'name': 'Everyone 10+', 'slug': 'eve...","[{'id': -1, 'image': 'https://media.rawg.io/me...",
3,1991,killallzombies-2,#KILLALLZOMBIES,2014-10-28,False,https://media.rawg.io/media/screenshots/675/67...,2.36,1,"[{'id': 1, 'title': 'skip', 'count': 5, 'perce...",11,...,0f0f0f,"[{'platform': {'id': 1, 'name': 'Xbox One', 's...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 4, 'name': 'Action', 'slug': 'action',...","[{'id': 2061, 'store': {'id': 2, 'name': 'Xbox...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...",,"[{'id': -1, 'image': 'https://media.rawg.io/me...",
4,979,tethered,Tethered,2016-10-25,False,https://media.rawg.io/media/screenshots/1e6/1e...,2.75,4,"[{'id': 4, 'title': 'recommended', 'count': 5,...",12,...,0f0f0f,"[{'platform': {'id': 18, 'name': 'PlayStation ...","[{'platform': {'id': 1, 'name': 'PC', 'slug': ...","[{'id': 3, 'name': 'Adventure', 'slug': 'adven...","[{'id': 1002, 'store': {'id': 3, 'name': 'Play...",,"[{'id': 31, 'name': 'Singleplayer', 'slug': 's...","{'id': 2, 'name': 'Everyone 10+', 'slug': 'eve...","[{'id': -1, 'image': 'https://media.rawg.io/me...",


In [42]:
combined_api_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12140 entries, 0 to 12139
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  12140 non-null  int64  
 1   slug                12140 non-null  object 
 2   name                12140 non-null  object 
 3   released            11881 non-null  object 
 4   tba                 12140 non-null  bool   
 5   background_image    12120 non-null  object 
 6   rating              12140 non-null  float64
 7   rating_top          12140 non-null  int64  
 8   ratings             12140 non-null  object 
 9   ratings_count       12140 non-null  int64  
 10  reviews_text_count  12140 non-null  int64  
 11  added               12140 non-null  int64  
 12  added_by_status     12140 non-null  object 
 13  metacritic          4628 non-null   float64
 14  playtime            12140 non-null  int64  
 15  suggestions_count   12140 non-null  int64  
 16  upda

In [43]:
metacritic_data.head()

Unnamed: 0,Title,released,Developer,Publisher,Genres,rating,User Score,ratings_count
0,ZIGGURAT,2012-02-17,ACTION BUTTON ENTERTAINMENT,FRESHUU INC.,ACTION,,6.9,14
1,4X4 EVO 2,2001-11-15,TERMINAL REALITY,GATHERING,AUTO RACING SIM,RATED E FOR EVERYONE,0.0,0
2,MOTOGP 2,2002-01-22,NAMCO,NAMCO,AUTO RACING SIM,RATED E FOR EVERYONE,5.8,0
3,GOTHIC 3,2006-11-14,PIRANHA BYTES,ASPYR,WESTERN RPG,RATED T FOR TEEN,7.5,832
4,SIEGE SURVIVAL GLORIA VICTIS,2021-05-18,FISHTANKSTUDIO,BLACK EYE GAMES,RPG,,6.5,10


In [44]:
metacritic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14055 entries, 0 to 14054
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Title          14034 non-null  object 
 1   released       13991 non-null  object 
 2   Developer      13917 non-null  object 
 3   Publisher      13917 non-null  object 
 4   Genres         14034 non-null  object 
 5   rating         11005 non-null  object 
 6   User Score     14055 non-null  float64
 7   ratings_count  14055 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 878.6+ KB


## Transformations and merge

Genres in PostgreSQL are comma-separated strings, we will join API genres lists

In [48]:
api_relevant_columns = [
    'name', 'released', 'rating', 'ratings_count', 'metacritic', 'genres'
]
api_data_relevant = combined_api_data[api_relevant_columns].copy()

api_data_relevant['genres'] = api_data_relevant['genres'].apply(
    lambda x: ', '.join([genre['name'] for genre in eval(x)]) if pd.notnull(x) else x
)

Rename columns to match PostgreSQL data

In [49]:
api_data_relevant.rename(columns={
    'name': 'Title',
    'rating': 'Rating',
    'metacritic': 'Metacritic Score',
    'ratings_count': 'Ratings Count'
}, inplace=True)

Merge the combined API data with the PostgreSQL data

In [52]:
# Debug: Display the column names of both DataFrames
print("API Data Relevant Columns:", api_data_relevant.columns.tolist())
print("Database Data Columns:", metacritic_data.columns.tolist())

API Data Relevant Columns: ['Title', 'released', 'Rating', 'Ratings Count', 'Metacritic Score', 'genres']
Database Data Columns: ['Title', 'released', 'Developer', 'Publisher', 'Genres', 'rating', 'User Score', 'ratings_count']


In [53]:
# Merge the combined API data with the PostgreSQL data
merged_data = pd.merge(api_data_relevant, metacritic_data, on=['Title', 'released', 'genres'], how='outer')

# Display merged data
print("Merged Data:")
print(merged_data.head())

KeyError: 'genres'