In [1]:
import pandas as pd
import numpy as np

In [2]:
INPUT_CSV_FILE_NAME = 'extracted_movies.csv'
MOVIES_DETAILS_CSV_NAME = 'movies_details.csv'
MOVIES_CAST_CSV_NAME = 'movies_cast.csv'
MOVIES_REVIEWS_CSV_NAME = 'movies_reviews.csv'
END_REVIEW_TOKEN = '$&&&&$'

In [3]:
movies = pd.read_csv(INPUT_CSV_FILE_NAME)
movies_details = movies[['Title', 'Year', 'Director', 'Movie summary']].copy()

In [4]:
movies_details.loc[:,'Title'] = movies_details.loc[:,'Title'].str.strip()
# oczyszczanie kolumny 'Title' z niepotrzebnych białych znaków

In [5]:
movies_details.loc[:,'Year'] = movies_details.loc[:,'Year'].str.extract('(\d+)').astype(int)
# wyciągnięcie numeru z kolumny 'Year' i zmiana na int

In [6]:
movies_details.loc[:,'Director'] = movies_details.loc[:,'Director'].str.replace("\nDirector:\n", "").str.replace("\nDirectors:\n", "")
# wyciągnięcie z kolumny 'Title' imię i nazwisko reżysera

In [7]:
movies_details

Unnamed: 0,Title,Year,Director,Movie summary
0,The Irishman,2019,Martin Scorsese,A mob hitman recalls his possible involvement ...
1,Frozen II,2019,"Chris Buck, Jennifer Lee","Anna, Elsa, Kristoff, Olaf and Sven leave Aren..."
2,Knives Out,2019,Rian Johnson,A detective investigates the death of a patria...


In [8]:
movies_details.to_csv(MOVIES_DETAILS_CSV_NAME, index=False)

In [9]:
#####################################################################################

In [10]:
movies_reviews = movies.filter(['Title', 'Reviews'], axis=1)

In [11]:
def format_reviews(reviews_string):
    '''Formating reviews string and separating them to list elements.'''
    END_REVIEW_TOKEN = '$&&&&$'
    reviews_list = list(reviews_string.replace("[", "")
                                      .replace("]", "")
                                      .replace("\'", "")
                                      .replace("\\", "")
                                      .replace("\\t", "")
                                      .replace("\\s", "")
                                      .replace("\\m", "")
                                      .replace(', "', " ")
                                      .strip()
                                      .replace(END_REVIEW_TOKEN + '"', END_REVIEW_TOKEN)
                                      .split(END_REVIEW_TOKEN))
     
    del reviews_list[-1]
    return reviews_list

In [12]:
movies_reviews.loc[:,'Reviews'] = movies_reviews.loc[:,'Reviews'].apply(format_reviews)

In [15]:
def create_row_for_each_list_element(df, list_column_to_split):
    '''Creating row in dataframe for each list element.'''

    extended_df = pd.DataFrame({
                        col:np.repeat(df[col].values, df[list_column_to_split].str.len())
                        for col in df.columns.drop(list_column_to_split)
                        }).assign(**{list_column_to_split:np.concatenate(df[list_column_to_split].values)})[df.columns]
    return extended_df

In [16]:
movies_reviews = create_row_for_each_list_element(movies_reviews, 'Reviews')

In [17]:
movies_reviews.to_csv(MOVIES_REVIEWS_CSV_NAME, index=False)

In [18]:
###########################################################################################

def from_string_to_dict_on_cast(cast_string):
    '''Modifing cast string and saving results to dictionary'''
    #przetworzenie stringu na listę aktorzy i role w jednej tabeli
    #aktorzy nieparzyści, role parzyste
    cast_string = list(cast_string.replace("\n", "")
                              .strip()
                              .replace("\\n", "")
                              .replace("'", "").replace("[  ", "")
                              .replace("]", "")
                              .replace("               ...          ", ",   ")
                              .split(",   "))
    #przekonwertowanie listy na słownik aktor:rola
    cast_dict = dict(zip(cast_string[::2], cast_string[1::2]))
    return cast_dict


In [19]:
movies_cast = movies.filter(['Title', 'Cast'], axis=1)

In [20]:
def from_string_to_list_on_cast(cast_string):
    '''Modifing cast string and saving results to dictionary'''
    #przetworzenie stringu na listę aktorzy i role w jednej tabeli
    #aktorzy nieparzyści, role parzyste
    cast_string = list(cast_string.replace("\n", "")
                              .strip()
                              .replace("\\n", "")
                              .replace("'", "").replace("[  ", "")
                              .replace("]", "")
                              .replace("               ...          ", ": ")
                              .split(",   "))
    #przekonwertowanie listy na słownik aktor:rola
    cast_dict = cast_string[::2] + cast_string[1::2]
    return cast_dict


In [21]:
movies_cast.loc[:,'Cast'] = movies_cast.loc[:,'Cast'].apply(from_string_to_list_on_cast)

In [22]:
movies_cast

Unnamed: 0,Title,Cast
0,The Irishman,"[Robert De Niro: Frank Sheeran, Joe Pesci: Rus..."
1,Frozen II,[Kristen Bell: Anna (voice) ...
2,Knives Out,"[Daniel Craig: Benoit Blanc, Ana de Armas: Mar..."


In [23]:
movies_cast = create_row_for_each_list_element(movies_cast, 'Cast')

In [26]:
movies_cast.to_csv(MOVIES_CAST_CSV_NAME, index=False)