In [315]:
import warnings, json
import pandas as pd
from tabulate import tabulate

warnings.filterwarnings("ignore")

# Load and extract Data

## Data source I

In [316]:
import urllib.request

source_I_url = "https://raw.githubusercontent.com/rahilpacmann/case-data-wrangling-api/main/city.csv"

with urllib.request.urlopen(source_I_url) as response:
    df_1 = pd.read_csv(response)
df_1.head(5)

Unnamed: 0,city_id,city,country
0,1,A Corua (La Corua),Spain
1,2,Abha,Saudi Arabia
2,3,Abu Dhabi,United Arab Emirates
3,4,Acua,Mexico
4,5,Adana,Turkey


## Data source II

In [317]:
source_II_url = "https://raw.githubusercontent.com/rahilpacmann/case-data-wrangling-api/main/country.csv"

with urllib.request.urlopen(source_II_url) as response:
    df_2 = pd.read_csv(response)
df_2.head(5)

Unnamed: 0,country,last_update
0,Afghanistan,2006-02-15 09:44:00
1,Algeria,2006-02-15 09:44:00
2,American Samoa,2006-02-15 09:44:00
3,Angola,2006-02-15 09:44:00
4,Anguilla,2006-02-15 09:44:00


## Data source III

In [318]:
from sqlalchemy import create_engine, Engine
from pandas import DataFrame

def source_postgres_engine(database_name: str):
    # connection to database
    user = "postgres"
    password = "qwerty123"
    host = "localhost"
    port = "5433"

    engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database_name}")
    
    return engine

def get_table_data(table_name: str, engine: Engine) -> DataFrame:
    query = f"SELECT * FROM {table_name}"
    return pd.read_sql(query, engine)

In [319]:
engine = source_postgres_engine("dvdrental")

actor_df = get_table_data("actor", engine)
store_df = get_table_data('store', engine)
address_df = get_table_data('address', engine)
category_df = get_table_data('category', engine)
customer_df = get_table_data('customer', engine)
film_actor_df = get_table_data('film_actor', engine)
film_category_df = get_table_data('film_category', engine)
inventory_df = get_table_data('inventory',engine)
language_df = get_table_data('language',engine)
rental_df = get_table_data('rental',engine)
staff_df = get_table_data('staff',engine)
payment_df = get_table_data('payment',engine)
film_df = get_table_data('film',engine)

In [320]:
data_dict = {
    "actor": actor_df,
    "store": store_df,
    "address": address_df,
    "category": category_df,
    "customer": customer_df,
    "film_actor": film_actor_df,
    "film_category": film_category_df,
    "inventory": inventory_df,
    "language": language_df,
    "rental": rental_df,
    "staff": staff_df,
    "payment": payment_df,
    "film": film_df,
    "city": df_1,
    "country": df_2    
}

# Data Validation

In [321]:
requirement_url = "https://rahilpacmann.github.io/case-data-wrangling-api/requirements_table.json"

with urllib.request.urlopen(requirement_url) as response:
    requirement_json = json.load(response)
print(requirement_json)

{'actor': [{'column_name': 'actor_id', 'data_type': 'int64'}, {'column_name': 'last_update', 'data_type': 'datetime64[ns]'}, {'column_name': 'first_name', 'data_type': 'object'}, {'column_name': 'last_name', 'data_type': 'object'}], 'store': [{'column_name': 'store_id', 'data_type': 'int64'}, {'column_name': 'manager_staff_id', 'data_type': 'int64'}, {'column_name': 'address_id', 'data_type': 'int64'}, {'column_name': 'last_update', 'data_type': 'datetime64[ns]'}], 'address': [{'column_name': 'last_update', 'data_type': 'datetime64[ns]'}, {'column_name': 'city_id', 'data_type': 'int64'}, {'column_name': 'address_id', 'data_type': 'int64'}, {'column_name': 'district', 'data_type': 'object'}, {'column_name': 'phone', 'data_type': 'object'}, {'column_name': 'postal_code', 'data_type': 'object'}, {'column_name': 'address', 'data_type': 'object'}, {'column_name': 'address2', 'data_type': 'object'}], 'category': [{'column_name': 'category_id', 'data_type': 'int64'}, {'column_name': 'last_upd

In [322]:
actual_table_name = list(requirement_json.keys())
print(actual_table_name)

['actor', 'store', 'address', 'category', 'city', 'country', 'customer', 'film_actor', 'film_category', 'inventory', 'language', 'rental', 'staff', 'payment', 'film']


In [323]:
def check_table_requirements(actual_table: dict, requirement_table: dict):
    actual_table_name = list(actual_table.keys())
    requirement_table_name = list(requirement_table.keys())

    table_checking = []
    for table_name in requirement_table_name:
        if table_name in actual_table_name:
            table_checking.append([table_name, "✓"])
        else:
            table_checking.append([table_name, "✗"])
    
    table_headers = ["Table name", "Is exist"]
    table = tabulate(table_checking, headers=table_headers, tablefmt="grid")
    print("=> STEP 1: CHECK TABLE")
    print(table)

check_table_requirements(data_dict, requirement_json)

=> STEP 1: CHECK TABLE
+---------------+------------+
| Table name    | Is exist   |
| actor         | ✓          |
+---------------+------------+
| store         | ✓          |
+---------------+------------+
| address       | ✓          |
+---------------+------------+
| category      | ✓          |
+---------------+------------+
| city          | ✓          |
+---------------+------------+
| country       | ✓          |
+---------------+------------+
| customer      | ✓          |
+---------------+------------+
| film_actor    | ✓          |
+---------------+------------+
| film_category | ✓          |
+---------------+------------+
| inventory     | ✓          |
+---------------+------------+
| language      | ✓          |
+---------------+------------+
| rental        | ✓          |
+---------------+------------+
| staff         | ✓          |
+---------------+------------+
| payment       | ✓          |
+---------------+------------+
| film          | ✓          |
+---------------

In [324]:
def check_data_shape(actual_table: dict):
    table_shape = []

    for table_name in actual_table:
        rows, cols = actual_table[table_name].shape
        table_shape.append([table_name, rows, cols])
    
    table_headers = ["Table name", "Number of rows", "Number of columns"]
    table = tabulate(table_shape, headers=table_headers, tablefmt="grid")
    print("=> STEP 2: CHECK TABLE SHAPE")
    print(table)
    
check_data_shape(data_dict)

=> STEP 2: CHECK TABLE SHAPE
+---------------+------------------+---------------------+
| Table name    |   Number of rows |   Number of columns |
| actor         |              200 |                   4 |
+---------------+------------------+---------------------+
| store         |                2 |                   4 |
+---------------+------------------+---------------------+
| address       |              603 |                   8 |
+---------------+------------------+---------------------+
| category      |               16 |                   3 |
+---------------+------------------+---------------------+
| customer      |              599 |                  10 |
+---------------+------------------+---------------------+
| film_actor    |             5462 |                   3 |
+---------------+------------------+---------------------+
| film_category |             1000 |                   3 |
+---------------+------------------+---------------------+
| inventory     |          

In [325]:
def check_columns(actual_table: dict, requirement_table: dict):
    print("=> STEP 3: CHECK COLUMNS")

    for table_name in requirement_table:
        result = []
        actual_columns = list(actual_table[table_name].columns)
        requirement_columns = []

        for data in requirement_table[table_name]:
            requirement_columns.append(data["column_name"])
        
        for column_name in set(actual_columns + requirement_columns):
            in_actual_table = "✔" if column_name in actual_columns else "✘"
            in_requirement_table = "✔" if column_name in requirement_columns else "✘"
            result.append([column_name, in_actual_table, in_requirement_table])

        if set(actual_columns) == set(requirement_columns):
            pass
        else:
            print(table_name)
            table_headers = ["Column name", "In actual table", "In requirement table"]
            table = tabulate(result, headers=table_headers, tablefmt="grid")
            print(table)
            print("\n")
            
check_columns(data_dict, requirement_json)

=> STEP 3: CHECK COLUMNS
city
+---------------+-------------------+------------------------+
| Column name   | In actual table   | In requirement table   |
| country       | ✔                 | ✘                      |
+---------------+-------------------+------------------------+
| city_id       | ✔                 | ✔                      |
+---------------+-------------------+------------------------+
| country_id    | ✘                 | ✔                      |
+---------------+-------------------+------------------------+
| last_update   | ✘                 | ✔                      |
+---------------+-------------------+------------------------+
| city          | ✔                 | ✔                      |
+---------------+-------------------+------------------------+


country
+---------------+-------------------+------------------------+
| Column name   | In actual table   | In requirement table   |
| country       | ✔                 | ✔                      |
+--------------

In [326]:
def check_data_type(actual_table: dict, requirement_table: dict):
    result = []

    for table_name, df in actual_table.items():
        if table_name in requirement_table:
            for info_table in requirement_table[table_name]:
                column_name = info_table["column_name"] 
                data_type_req = info_table["data_type"]
                if column_name in df.columns:
                    data_type_actual = df[column_name].dtype
                    result_data_type = "✔" if data_type_req == data_type_actual else "✘"
                    result.append([table_name, column_name, data_type_actual, 
                                   data_type_req, result_data_type])
                else:
                    result.append([table_name, column_name, "N/A", 
                                   data_type_req, "✘ (Column not found)"])
    
    print("=> STEP 4: CHECK DATA TYPE")
    table_headers = ["Table name", "Column name", "Actual type", "Requirement type", "Match"]
    
    missmatch_data = [row for row in result if "✘" in row[4]] 
    if missmatch_data:
        print("\nSummary of Mismatches Data Types:")
        table = tabulate(missmatch_data, headers=table_headers, tablefmt="grid")
        print(table)
    else:
        print("All data types match")

check_data_type(data_dict, requirement_json)

=> STEP 4: CHECK DATA TYPE

Summary of Mismatches Data Types:
+--------------+---------------+---------------+--------------------+----------------------+
| Table name   | Column name   | Actual type   | Requirement type   | Match                |
| customer     | create_date   | object        | datetime64[ns]     | ✘                    |
+--------------+---------------+---------------+--------------------+----------------------+
| city         | country_id    | N/A           | int64              | ✘ (Column not found) |
+--------------+---------------+---------------+--------------------+----------------------+
| city         | last_update   | N/A           | datetime64[ns]     | ✘ (Column not found) |
+--------------+---------------+---------------+--------------------+----------------------+
| country      | country_id    | N/A           | int64              | ✘ (Column not found) |
+--------------+---------------+---------------+--------------------+----------------------+
| countr

In [327]:
def check_missing_values(actual_table: dict):
    result = []

    for table_name, df in actual_table.items():
        missing_count = df.isna().sum()
        total_data = df.shape[0]
        
        for column, missing in missing_count.items():
            missing_count_percantage = round((missing / total_data * 100), 2)
            result.append([table_name, column, missing, missing_count_percantage])
    
    table_headers = ["Table name", "Column name", "Missing value count", 
                     "Missing value percentage"]
    missing_value = [row for row in result if row[2] != 0]
    if missing_value:
        table = tabulate(missing_value, headers=table_headers, tablefmt="grid")
        print("=> STEP 5: CHECK MISSING VALUE")
        print(table)
    else:
        print("There's no Missing Values")

check_missing_values(data_dict)

=> STEP 5: CHECK MISSING VALUE
+--------------+---------------+-----------------------+----------------------------+
| Table name   | Column name   |   Missing value count |   Missing value percentage |
| address      | address2      |                     4 |                       0.66 |
+--------------+---------------+-----------------------+----------------------------+
| rental       | return_date   |                   183 |                       1.14 |
+--------------+---------------+-----------------------+----------------------------+
| staff        | picture       |                     1 |                      50    |
+--------------+---------------+-----------------------+----------------------------+
| city         | city          |                    10 |                       1.48 |
+--------------+---------------+-----------------------+----------------------------+
| city         | country       |                     7 |                       1.03 |
+--------------+-------

In [328]:
def check_duplicates_data(actual_table: dict):
    result = []

    for table_name, df in actual_table.items():
        # duplicate_rows = df[df.duplicated(keep=False)]
        duplicate_rows = df.astype(str).duplicated(keep=False).sum()
        result.append([table_name, duplicate_rows])
    
    duplicate_data = [row for row in result if row[1] != 0]
    if duplicate_data:
        table_headers = ["Table name", "Duplicate rows count"]
        duplicates_data = [row for row in result if row[1] != 0]
        table = tabulate(duplicates_data, headers=table_headers, tablefmt="grid")
        print("=> STEP 6: CHECK DUPLICATES DATA")
        print("Duplicate Data Summary:")
        print(table)
    else:   
        print("No Duplicate Data Found")

check_duplicates_data(data_dict)

=> STEP 6: CHECK DUPLICATES DATA
Duplicate Data Summary:
+--------------+------------------------+
| Table name   |   Duplicate rows count |
| city         |                    154 |
+--------------+------------------------+


# Data Transform

## Data Cleansing
- Berdasarkan **hasil validations** diketahui bahwa data yang diperoleh dari beberapa data source **masih belum clean.**
- **Kolom yang tidak sesuai requirements terdapat pada tabel:** 
    - city
    - country
- **Terdapat missing values pada kolom :**
    - address
    - city
    - rental
    - staff
- **Tipe data yang tidak sesuai requirements terdapat pada tabel :**
    - city
    - country
    - customer
- **Terdapat duplicates data pada kolom :**
    - city

### Handle mismatch columns

**Table Country**

In [329]:
# table actual
country_table = data_dict["country"]
country_table.head(5)

Unnamed: 0,country,last_update
0,Afghanistan,2006-02-15 09:44:00
1,Algeria,2006-02-15 09:44:00
2,American Samoa,2006-02-15 09:44:00
3,Angola,2006-02-15 09:44:00
4,Anguilla,2006-02-15 09:44:00


In [330]:
# table requirement
requirement_json["country"]

[{'column_name': 'country_id', 'data_type': 'int64'},
 {'column_name': 'last_update', 'data_type': 'datetime64[ns]'},
 {'column_name': 'country', 'data_type': 'object'}]

In [331]:
country_table["country_id"] = range(1, len(country_table) + 1) 
country_table = country_table[["country_id", "country", "last_update"]]
country_table.head(5)

Unnamed: 0,country_id,country,last_update
0,1,Afghanistan,2006-02-15 09:44:00
1,2,Algeria,2006-02-15 09:44:00
2,3,American Samoa,2006-02-15 09:44:00
3,4,Angola,2006-02-15 09:44:00
4,5,Anguilla,2006-02-15 09:44:00


In [332]:
# menambahkan dataframe yang telah diperbarui ke data_dict
data_dict["country"] = country_table

**Table City**

In [333]:
# table actual
city_table = data_dict["city"]
city_table.head(5)

Unnamed: 0,city_id,city,country
0,1,A Corua (La Corua),Spain
1,2,Abha,Saudi Arabia
2,3,Abu Dhabi,United Arab Emirates
3,4,Acua,Mexico
4,5,Adana,Turkey


In [334]:
# table requirement
requirement_json["city"]

[{'column_name': 'city_id', 'data_type': 'int64'},
 {'column_name': 'country_id', 'data_type': 'int64'},
 {'column_name': 'last_update', 'data_type': 'datetime64[ns]'},
 {'column_name': 'city', 'data_type': 'object'}]

In [335]:
country_table = data_dict["country"]
city_merge = city_table.merge(
    country_table,
    how="inner",
    on="country"
)
city_merge = city_merge[["city_id", "country_id", "city", "last_update"]]
city_merge.head()

Unnamed: 0,city_id,country_id,city,last_update
0,1,87,A Corua (La Corua),2006-02-15 09:44:00
1,2,82,Abha,2006-02-15 09:44:00
2,3,101,Abu Dhabi,2006-02-15 09:44:00
3,4,60,Acua,2006-02-15 09:44:00
4,5,97,Adana,2006-02-15 09:44:00


In [336]:
data_dict["city"] = city_merge

In [337]:
check_columns(data_dict, requirement_json)

=> STEP 3: CHECK COLUMNS


### Handle missing values

In [338]:
def remove_missing_values(actual_table: dict) -> dict:
    clean_actual_table = {}

    for table_name, df in actual_table.items():
        df = df.dropna()
        clean_actual_table[table_name] = df
    
    return clean_actual_table

data_clean = remove_missing_values(data_dict)
check_missing_values(data_clean)

There's no Missing Values


### Handle Data Types

In [339]:
def adjust_data_type(actual_table: dict, requirement_table: dict) -> dict:
    data_clean = {}

    for table_name, df in actual_table.items():
        requirment_type = {}
        for requirment_column in requirement_table[table_name]:
            column = requirment_column["column_name"]
            type_data = requirment_column["data_type"]
            requirment_type[column] = type_data

        for column_name in df.columns:
            df[column_name] = df[column_name].astype(requirment_type[column_name])

        data_clean[table_name] = df
    
    return data_clean

data_clean = adjust_data_type(data_clean, requirement_json)
check_data_type(data_clean, requirement_json)
        

=> STEP 4: CHECK DATA TYPE
All data types match


### Handle Duplicates Data

In [340]:
def remove_duplicates(actual_table: dict) -> dict:
    data_clean = {}

    for table_name, df in actual_table.items():
        df = df.astype(str).drop_duplicates(keep="first")
        # duplicate_rows = df.astype(str).duplicated(keep=False).sum()
        data_clean[table_name] = df
    return data_clean

data_clean = remove_duplicates(data_clean)
check_duplicates_data(data_clean)

No Duplicate Data Found


## Data Manipulation & Data Selection

In [341]:
film_category_df = data_clean["film_category"]
category_df = data_clean["category"]

film_list = category_df.merge(
    film_category_df,
    how="left",
    on="category_id",
    suffixes=("_x1", "_y1")
)

film_list.head()

Unnamed: 0,category_id,name,last_update_x1,film_id,last_update_y1
0,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09
1,1,Action,2006-02-15 09:46:27,21,2006-02-15 10:07:09
2,1,Action,2006-02-15 09:46:27,29,2006-02-15 10:07:09
3,1,Action,2006-02-15 09:46:27,38,2006-02-15 10:07:09
4,1,Action,2006-02-15 09:46:27,56,2006-02-15 10:07:09


In [342]:
film_df = data_clean["film"]

film_list = film_list.merge(
    film_df,
    how="left",
    on="film_id",
    suffixes=("_x2", "_y2")
)

film_list.head()

Unnamed: 0,category_id,name,last_update_x1,film_id,last_update_y1,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,0.99,113,20.99,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...
1,1,Action,2006-02-15 09:46:27,21,2006-02-15 10:07:09,American Circus,A Insightful Drama of a Girl And a Astronaut w...,2006,1,3,4.99,129,17.99,R,2013-05-26 14:50:58.951,"['Commentaries', 'Behind the Scenes']",'administr':17 'american':1 'astronaut':11 'ci...
2,1,Action,2006-02-15 09:46:27,29,2006-02-15 10:07:09,Antitrust Tomatoes,A Fateful Yarn of a Womanizer And a Feminist w...,2006,1,5,2.99,168,11.99,NC-17,2013-05-26 14:50:58.951,"['Trailers', 'Commentaries', 'Deleted Scenes']",'administr':17 'ancient':19 'antitrust':1 'dat...
3,1,Action,2006-02-15 09:46:27,38,2006-02-15 10:07:09,Ark Ridgemont,A Beautiful Yarn of a Pioneer And a Monkey who...,2006,1,6,0.99,68,25.99,NC-17,2013-05-26 14:50:58.951,"['Trailers', 'Commentaries', 'Deleted Scenes',...",'ark':1 'beauti':4 'desert':20 'explor':16 'mo...
4,1,Action,2006-02-15 09:46:27,56,2006-02-15 10:07:09,Barefoot Manchurian,A Intrepid Story of a Cat And a Student who mu...,2006,1,6,2.99,129,15.99,G,2013-05-26 14:50:58.951,"['Trailers', 'Commentaries']",'abandon':19 'amus':20 'barefoot':1 'cat':8 'g...


In [343]:
film_actor_df = data_clean["film_actor"]

film_list = film_list.merge(
    film_actor_df,
    how="inner",
    on="film_id",
    suffixes=("_x3", "_y3")
)
film_list.head()

Unnamed: 0,category_id,name,last_update_x1,film_id,last_update_y1,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update_x3,special_features,fulltext,actor_id,last_update_y3
0,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,0.99,113,20.99,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,5,2006-02-15 10:05:03
1,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,0.99,113,20.99,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,27,2006-02-15 10:05:03
2,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,0.99,113,20.99,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,37,2006-02-15 10:05:03
3,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,0.99,113,20.99,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,43,2006-02-15 10:05:03
4,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,0.99,113,20.99,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,84,2006-02-15 10:05:03


In [344]:
actor_df = data_clean["actor"]
actor_df["full_name"] = actor_df["first_name"] + ' ' + actor_df["last_name"]
actor_df["full_name"] = actor_df['full_name'].astype(str)

film_list = film_list.merge(
    actor_df,
    how="inner",
    on="actor_id",
    suffixes=("_x4", "_y4")
)
film_list.head()

Unnamed: 0,category_id,name,last_update_x1,film_id,last_update_y1,title,description,release_year,language_id,rental_duration,...,rating,last_update_x3,special_features,fulltext,actor_id,last_update_y3,first_name,last_name,last_update,full_name
0,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,...,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,5,2006-02-15 10:05:03,Johnny,Lollobrigida,2013-05-26 14:47:57.620,Johnny Lollobrigida
1,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,...,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,27,2006-02-15 10:05:03,Julia,Mcqueen,2013-05-26 14:47:57.620,Julia Mcqueen
2,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,...,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,37,2006-02-15 10:05:03,Val,Bolger,2013-05-26 14:47:57.620,Val Bolger
3,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,...,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,43,2006-02-15 10:05:03,Kirk,Jovovich,2013-05-26 14:47:57.620,Kirk Jovovich
4,1,Action,2006-02-15 09:46:27,19,2006-02-15 10:07:09,Amadeus Holy,A Emotional Display of a Pioneer And a Technic...,2006,1,6,...,PG,2013-05-26 14:50:58.951,"['Commentaries', 'Deleted Scenes', 'Behind the...",'amadeus':1 'baloon':20 'battl':15 'display':5...,84,2006-02-15 10:05:03,James,Pitt,2013-05-26 14:47:57.620,James Pitt


In [345]:
film_list = film_list.groupby(['film_id', 'title', 'description', 'name', 'rental_rate', 'length', 'rating'])['full_name'].apply(lambda x: ', '.join(x))
film_list.head()

film_id  title                description                                                                                                      name         rental_rate  length  rating
1        Academy Dinosaur     A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies                 Documentary  0.99         86      PG        Penelope Guiness, Christian Gable, Lucille Tra...
10       Aladdin Calendar     A Action-Packed Tale of a Man And a Lumberjack who must Reach a Feminist in Ancient China                        Sports       4.99         63      NC-17     Alec Wayne, Judy Dean, Val Bolger, Ray Johanss...
100      Brooklyn Desert      A Beautiful Drama of a Dentist And a Composer who must Battle a Sumo Wrestler in The First Manned Space Station  Foreign      4.99         161     R         Jodie Degeneres, Jayne Neeson, Sean Guiness, A...
1000     Zorro Ark            A Intrepid Panorama of a Mad Scientist And a Boy who must Redeem a Boy in A

In [346]:
type(film_list)

pandas.core.series.Series

In [347]:
film_list = pd.DataFrame(film_list)
film_list = film_list.reset_index()

film_list.head()

Unnamed: 0,film_id,title,description,name,rental_rate,length,rating,full_name
0,1,Academy Dinosaur,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,0.99,86,PG,"Penelope Guiness, Christian Gable, Lucille Tra..."
1,10,Aladdin Calendar,A Action-Packed Tale of a Man And a Lumberjack...,Sports,4.99,63,NC-17,"Alec Wayne, Judy Dean, Val Bolger, Ray Johanss..."
2,100,Brooklyn Desert,A Beautiful Drama of a Dentist And a Composer ...,Foreign,4.99,161,R,"Jodie Degeneres, Jayne Neeson, Sean Guiness, A..."
3,1000,Zorro Ark,A Intrepid Panorama of a Mad Scientist And a B...,Comedy,4.99,50,NC-17,"Ian Tandy, Nick Degeneres, Lisa Monroe"
4,101,Brotherhood Blanket,A Fateful Character Study of a Butler And a Te...,Documentary,0.99,73,R,"Fred Costner, Frances Day-Lewis, Jude Cruise, ..."


In [348]:
rename_colom_map = {
    'film_id': 'fid',
    'name': 'category',
    'rental_rate': 'prince',
    'full_name': 'actors'
}

film_list.rename(columns=rename_colom_map, inplace=True)
film_list.head()


Unnamed: 0,fid,title,description,category,prince,length,rating,actors
0,1,Academy Dinosaur,A Epic Drama of a Feminist And a Mad Scientist...,Documentary,0.99,86,PG,"Penelope Guiness, Christian Gable, Lucille Tra..."
1,10,Aladdin Calendar,A Action-Packed Tale of a Man And a Lumberjack...,Sports,4.99,63,NC-17,"Alec Wayne, Judy Dean, Val Bolger, Ray Johanss..."
2,100,Brooklyn Desert,A Beautiful Drama of a Dentist And a Composer ...,Foreign,4.99,161,R,"Jodie Degeneres, Jayne Neeson, Sean Guiness, A..."
3,1000,Zorro Ark,A Intrepid Panorama of a Mad Scientist And a B...,Comedy,4.99,50,NC-17,"Ian Tandy, Nick Degeneres, Lisa Monroe"
4,101,Brotherhood Blanket,A Fateful Character Study of a Butler And a Te...,Documentary,0.99,73,R,"Fred Costner, Frances Day-Lewis, Jude Cruise, ..."
