In [None]:
import numpy as np 
import pandas as pd
import regex as re
import unicodedata as ud
import nltk
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import wordpunct_tokenize

## Movies Datasets

In [None]:
# Load the raw per-decade movie tables; paths are relative to the notebook's
# working directory.
forties_df = pd.read_csv('1940s_data/1940s_movies.csv')
fifties_df = pd.read_csv('1950s_data/Movies_chronicled1950s.csv')
sixties_df = pd.read_csv('1960s_data/1960s_movies.csv')
seventies_df = pd.read_csv('1970s_data/1970s_movies.csv')

#### EDA functions

In [None]:
def describe_dataframe(df):
    """Print the ``describe()`` summary of every column of ``df``.

    Each summary is preceded by the column name and followed by a dashed
    separator line. Nothing is returned.
    """
    for column_name in df:
        summary = df[column_name].describe()
        print('Column Name: ', str(column_name))
        print(summary)
        print('------------------------------')

#### Cleansing functions

In [None]:
def change_empty_entries_to_NaN(df, column_name):
    """Replace placeholder entries of one column with NaN, in place.

    A cell counts as empty when its string form contains ``'??'`` or when it
    is exactly the string ``'nan'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    column_name : str
        Label of the column to scan.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    column = df[column_name]
    # regex=False: '??' must be matched literally, not as a regex quantifier.
    has_placeholder = column.astype(str).str.contains('??', regex=False)
    is_nan_text = column == 'nan'
    # A boolean mask works with any index (labels from range(len(df)) only
    # exist on a default RangeIndex). np.nan is used because the np.NaN alias
    # was removed in NumPy 2.0.
    df.loc[has_placeholder | is_nan_text, column_name] = np.nan
    return df
    

In [None]:
def change_empty_synopses_type(df, column_name):
    """Print every cell of ``column_name`` whose value is a plain ``float``.

    Despite its name, this helper changes nothing: it prints the matching
    cells and returns ``df`` as received. Rows are looked up by the labels
    ``0 .. len(df) - 1``.
    """
    for label in range(len(df)):
        value = df.loc[label, column_name]
        # Exact type comparison: NumPy float scalars are not reported.
        if type(value) is not float:
            continue
        print(value)
    return df

In [None]:
def unify_date_format(df):
    """Rewrite year-only release dates as a full day/month/year triple, in place.

    Columns are addressed by position (15, 16 and 17).
    NOTE(review): these are presumably the day, month and year columns added
    by ``expand_release_date`` -- confirm against the real column order.

    When the string form of the value at position 15 is exactly four
    characters long, that value is copied to position 17, position 16 is set
    to 'يناير' and position 15 is set to 1. Other rows are left untouched.
    Returns the same ``df`` object.
    """
    day_pos, month_pos, year_pos = 15, 16, 17
    for position in range(len(df)):
        candidate = df.iloc[position, day_pos]
        if len(str(candidate)) != 4:
            continue
        df.iloc[position, year_pos] = candidate
        df.iloc[position, month_pos] = 'يناير'
        df.iloc[position, day_pos] = 1

    return df

In [None]:
def expand_release_date(df):
    """Split the ' تاريخ العرض' column into 'اليوم', 'الشهر' and 'السنة' columns.

    The value is split on single spaces into at most three parts. Rows with
    fewer parts (for example a year on its own) get missing values in the
    remaining columns; any text after the second space stays in the third
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ' تاريخ العرض' column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three new columns.
    """
    parts = (
        df[' تاريخ العرض']
        # n=2 caps the result at three columns.
        .str.split(' ', n=2, expand=True)
        # Pad to exactly three columns so the assignment below cannot fail
        # with "Columns must be same length as key".
        .reindex(columns=range(3))
    )
    df[['اليوم', 'الشهر', 'السنة']] = parts

    return df
    

In [None]:
def find_movies_with_missing_synopses(df):
    """Return the first-column value of every row whose fourth column is a float.

    Columns are read by position (0 and 3).
    NOTE(review): presumably column 0 is the title and column 3 the synopsis,
    with a float NaN marking a missing synopsis -- confirm against the data.
    """
    return [
        df.iloc[position, 0]
        for position in range(len(df))
        if type(df.iloc[position, 3]) is float
    ]

In [None]:
def parse_line(line_to_parse):
    """Parse one ``name(year)||feature`` line of a scraped-data file.

    Parameters
    ----------
    line_to_parse : str
        A line without its trailing newline.

    Returns
    -------
    tuple of str
        ``(name, year, df_feature)``. ``name`` is returned exactly as written
        (no whitespace trimming) and ``year`` is a string.

    Raises
    ------
    IndexError
        If the line has no ``'||'`` separator or no ``'('`` before the year.
    """
    parsed_values = line_to_parse.split('||')
    # Split on the LAST '(' only, so titles that themselves contain
    # parentheses keep their full text and the year is still found.
    movie_name_date = parsed_values[0].strip(')').rsplit('(', 1)

    name = movie_name_date[0]
    year = movie_name_date[1]
    df_feature = parsed_values[1]

    return name, year, df_feature

In [None]:
def add_scraped_features(df, file_path, column_name, encoding='utf-8'):
    """Copy scraped values from a ``name(year)||value`` file into ``df``.

    Every line is parsed with ``parse_line`` and matched to the rows whose
    'اسم الفيلم' and 'السنة' equal the parsed name and year. For the ' ملخص'
    column a value is written only when the first matching cell is the
    placeholder string ``'nan'``; for any other column the value always
    overwrites the matching rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; modified in place.
    file_path : str
        Path of the scraped text file.
    column_name : str
        Column that receives the scraped value.
    encoding : str, optional
        Encoding of the scraped file. It is passed explicitly so that reading
        Arabic text does not depend on the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    fill_only_missing = column_name == ' ملخص'

    with open(file_path, 'r', encoding=encoding) as filehandle:
        for line in filehandle:
            # rstrip('\n') rather than line[:-1]: the last line of a file may
            # have no newline, and slicing would cut a real character.
            current_line = line.rstrip('\n')
            if not current_line.strip():
                continue

            name, year, df_feature = parse_line(current_line)
            is_movie = (df['اسم الفيلم'] == name) & (df['السنة'] == year)

            if not fill_only_missing:
                df.loc[is_movie, column_name] = df_feature
                continue

            synopses_cell = df.loc[is_movie, column_name]
            if synopses_cell.empty:
                # No row matches this name/year: report the title instead of
                # hiding the miss behind a bare ``except``.
                print(name)
            elif synopses_cell.values[0] == 'nan':
                df.loc[is_movie, column_name] = df_feature

    return df

In [None]:
def filter_foregin_movies(df, file_path, encoding='utf-8'):
    """Drop, in place, the movies whose scraped country does not contain 'مصر'.

    Every line of the file is parsed with ``parse_line`` into name, year and
    country; rows of ``df`` matching the name and year of a line whose
    country text lacks 'مصر' are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; modified in place (callers rely on this).
    file_path : str
        Path of the scraped ``name(year)||country`` file.
    encoding : str, optional
        Encoding of the scraped file. It is passed explicitly so that reading
        Arabic text does not depend on the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    foreign_labels = []

    with open(file_path, 'r', encoding=encoding) as filehandle:
        for line in filehandle:
            # rstrip('\n') rather than line[:-1]: the last line of a file may
            # have no newline, and slicing would cut a real character.
            current_line = line.rstrip('\n')
            if not current_line.strip():
                continue

            name, year, country = parse_line(current_line)
            if 'مصر' in country:
                continue

            is_movie = (df['اسم الفيلم'] == name) & (df['السنة'] == year)
            foreign_labels.extend(df.index[is_movie])

    # Drop once at the end (labels de-duplicated, order kept) instead of
    # rebuilding the frame for every foreign line.
    df.drop(index=list(dict.fromkeys(foreign_labels)), inplace=True)
    return df

In [None]:
def get_avg_short_movie_time(df):
    """Return the mean of ' مدة الفيلم (دقيقة)' over rows whose genre is 'ﻗﺼﻴﺮ'.

    The genre is compared with the exact string stored in ' تصنيف الفيلم'.
    """
    is_short = df[' تصنيف الفيلم'] == 'ﻗﺼﻴﺮ'
    durations = df.loc[is_short, ' مدة الفيلم (دقيقة)']

    return np.average(list(durations))

## *_1940s Movies_*

### EDA

In [None]:
forties_df.sample(10)

In [None]:
describe_dataframe(forties_df)

In [None]:
forties_df.loc[forties_df['اسم الفيلم'] == 'خاتم سليمان']

In [None]:
forties_df[' تصنيف الفيلم'].value_counts()

### Cleansing

In [None]:
forties_df = change_empty_entries_to_NaN(forties_df, ' تصنيف الفيلم')

In [None]:
forties_df[' ملخص'] = forties_df[' ملخص'].astype(str)

In [None]:
date_expanded_df_40s = expand_release_date(forties_df)
date_expanded_df_40s = unify_date_format(date_expanded_df_40s)

In [None]:
date_expanded_df_40s = date_expanded_df_40s.drop(columns=[' تاريخ العرض'])
date_expanded_df_40s['التقييم'] = np.nan

In [None]:
# Drop the 1940s movies that the scraped countries file marks as foreign.
foreigns_filtered_df_40s = filter_foregin_movies(
    date_expanded_df_40s,
    '1940s_data/countries_1940s.txt',
)

In [None]:
# Overwrite the genre column with the scraped 1940s genres.
genre_added_40s_df = add_scraped_features(
    foreigns_filtered_df_40s,
    '1940s_data/1940s_genres.txt',
    ' تصنيف الفيلم',
)

In [None]:
genre_added_40s_df[' تصنيف الفيلم'].value_counts()

In [None]:
# Fill the rating column from the scraped 1940s ratings.
ratings_added_40s_df = add_scraped_features(
    genre_added_40s_df,
    '1940s_data/1940s_ratings.txt',
    'التقييم',
)

In [None]:
# Add the scraped 1940s synopses.
synopses_added_40s_df = add_scraped_features(
    ratings_added_40s_df,
    '1940s_data/1940s_synopses.txt',
    ' ملخص',
)

In [None]:
synopses_added_40s_df = synopses_added_40s_df.reset_index(drop=True)

In [None]:
def check_synopsis_has_no_names(cast_list, synopsis):
    """Return ``synopsis`` with every occurrence of each cast name removed.

    Names are matched as exact substrings, in the order given; names that do
    not occur are skipped.
    """
    stripped = synopsis
    for actor in cast_list:
        if actor not in stripped:
            continue
        stripped = stripped.replace(actor, '')
    return stripped

In [None]:
def stem_synopsis(partial_df):
    """Replace each ' ملخص' cell with the ISRI stems of its words, in place.

    Words are split on whitespace. Every stem is followed by a single space,
    so a non-empty result ends with a trailing space. Rows are addressed by
    the labels ``0 .. len(partial_df) - 1``. Returns the same frame.
    """
    stemmer = ISRIStemmer()
    for label in range(len(partial_df)):
        words = partial_df.loc[label, ' ملخص'].split()
        stemmed = ''.join(stemmer.stem(word) + ' ' for word in words)
        partial_df.loc[label, ' ملخص'] = stemmed

    return partial_df

In [None]:
def preprocess_synopses(original_df, partial_df):
    """Write name-free, punctuation-free synopses into ``partial_df``.

    For every row label ``0 .. len(partial_df) - 1`` the cast names found in
    ``original_df[' تمثيل']`` (separated by the Arabic comma '،') are removed
    from ``original_df[' ملخص']``, then punctuation is removed, and the
    result is stored in ``partial_df[' ملخص']``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Source of the cast and raw synopsis columns.
    partial_df : pandas.DataFrame
        Frame that receives the cleaned synopsis; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``partial_df`` object.
    """
    for row_index in range(partial_df.shape[0]):

        cast = original_df.loc[row_index, ' تمثيل']
        if pd.isna(cast):
            # A missing cast must not become the literal name 'nan', which
            # would then be deleted from the synopsis text.
            cast_list = []
        else:
            # Trim the spaces that follow each comma so a name matches
            # wherever it appears, and ignore empty fragments.
            cast_list = [
                actor.strip()
                for actor in str(cast).split('،')
                if actor.strip()
            ]
        synopsis = original_df.loc[row_index, ' ملخص']

        without_names = check_synopsis_has_no_names(cast_list, synopsis)
        # Feed the name-stripped text forward; passing the raw synopsis here
        # would throw the name removal away.
        cleaned_synopsis = remove_arabic_characters(without_names)

        partial_df.loc[row_index, ' ملخص'] = cleaned_synopsis

    return partial_df

In [None]:
def remove_arabic_characters(synopsis):
    """Return ``synopsis`` without any Unicode punctuation characters.

    Despite the name, letters (Arabic or otherwise) are kept; only characters
    whose Unicode category starts with 'P' are dropped.
    """
    kept = []
    for symbol in synopsis:
        if ud.category(symbol).startswith('P'):
            continue
        kept.append(symbol)

    return ''.join(kept)

In [None]:
def remove_stop_words(partial_df):
    """Remove Arabic stop words from every ' ملخص' cell, in place.

    The stop list is NLTK's Arabic list plus 'على'. The text is split on
    whitespace, stop words are dropped as whole tokens, and the remaining
    tokens are re-joined with single spaces. Rows are addressed by the labels
    ``0 .. len(partial_df) - 1``.

    Parameters
    ----------
    partial_df : pandas.DataFrame
        Frame holding the ' ملخص' column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``partial_df`` object.
    """
    stop_words = set(nltk.corpus.stopwords.words("arabic"))
    stop_words.add('على')
    for row_index in range(partial_df.shape[0]):

        synopsis = partial_df.loc[row_index, ' ملخص']

        # Filter whole tokens. str.replace(word, '') would also delete the
        # stop word wherever it occurs inside a longer word.
        kept_words = [word for word in synopsis.split() if word not in stop_words]

        partial_df.loc[row_index, ' ملخص'] = ' '.join(kept_words)
    return partial_df


In [None]:
def remove_punctuation(partial_df):
    """Strip bracketed text, quoted text and punctuation from the third column.

    Cells equal to the string 'nan' are left as they are. For every other
    cell, three substitutions run in order: parenthesised spans, spans in
    double quotes, then any character that is neither a word character nor
    whitespace. The column is addressed by position (2). Returns the same
    frame, modified in place.
    """
    patterns = (r'\([^)]*\)', '".*?"', r'[^\w\s]')
    for position in range(len(partial_df)):
        text = partial_df.iloc[position, 2]
        if text == 'nan':
            continue
        for pattern in patterns:
            text = re.sub(pattern, '', text)

        partial_df.iloc[position, 2] = text
    return partial_df

In [None]:
def tokenize_synopsis(partial_df):
    """Replace every ' ملخص' cell with its list of tokens, in place.

    Tokens come from NLTK's ``wordpunct_tokenize``.

    Parameters
    ----------
    partial_df : pandas.DataFrame
        Frame holding the ' ملخص' column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``partial_df`` object.
    """
    # Map the whole column at once. Writing a list into a single cell through
    # .loc makes pandas treat the list as values to spread over the selection,
    # which can raise ValueError, and the per-row loop only worked on a
    # 0..n-1 index.
    partial_df[' ملخص'] = partial_df[' ملخص'].map(wordpunct_tokenize)
    return partial_df
    

### Prepare Synopses for Doc2Vec

In [None]:
# Take an independent copy of the three analysis columns: the frame is edited
# in place by the cleaning steps, and writing into a bare column subset
# triggers SettingWithCopyWarning and may not behave as intended.
synopses_analysis_df_40s = synopses_added_40s_df[
    ['اسم الفيلم', ' تصنيف الفيلم', ' ملخص']
].copy()

In [None]:
# Cleaning pipeline for the 1940s synopses; each step edits the frame in place
# and returns it.
# 1) Read cast and synopsis from the full frame, write the cleaned synopsis
#    into the analysis frame.
synopses_analysis_df_40s = preprocess_synopses(synopses_added_40s_df \
                                              , synopses_analysis_df_40s)
# 2) Strip bracketed text, quoted text and remaining punctuation.
synopses_analysis_df_40s = remove_punctuation(synopses_analysis_df_40s)
# 3) Remove Arabic stop words.
synopses_analysis_df_40s = remove_stop_words(synopses_analysis_df_40s)
# 4) Reduce every word to its ISRI stem.
synopses_analysis_df_40s = stem_synopsis(synopses_analysis_df_40s)

In [None]:
synopses_analysis_df_40s = tokenize_synopsis(synopses_analysis_df_40s)

In [None]:
synopses_analysis_df_40s.to_csv('1940s_data/dataset_40s.csv')

## *_1950s Movies_*

In [None]:
fifties_df.sample(10)

In [None]:
describe_dataframe(fifties_df)

In [None]:
fifties_df[' تصنيف الفيلم'].value_counts()

In [None]:
fifties_df[fifties_df.duplicated(subset = ['اسم الفيلم'])]

In [None]:
fifties_df.loc[fifties_df['اسم الفيلم'] == 'غرام في إستانبول (غرام في إسطنبول)']

In [None]:
fifties_df = change_empty_entries_to_NaN(fifties_df, ' تصنيف الفيلم')
fifties_df[' ملخص'] = fifties_df[' ملخص'].astype(str)

In [None]:
date_expanded_df_50s = expand_release_date(fifties_df)
date_expanded_df_50s = unify_date_format(date_expanded_df_50s)

In [None]:
date_expanded_df_50s = date_expanded_df_50s.drop(columns=[' تاريخ العرض'])
date_expanded_df_50s['التقييم'] = np.nan

In [None]:
# Drop the 1950s movies that the scraped countries file marks as foreign.
foreign_filtered_df_50s = filter_foregin_movies(
    date_expanded_df_50s,
    '1950s_data/countries_1950s.txt',
)

In [None]:
# Fill the rating column from the scraped 1950s ratings.
ratings_added_df_50s = add_scraped_features(
    foreign_filtered_df_50s,
    '1950s_data/1950s_ratings.txt',
    'التقييم',
)

In [None]:
# Overwrite the genre column with the scraped 1950s genres.
genres_added_df_50s = add_scraped_features(
    ratings_added_df_50s,
    '1950s_data/1950s_genres.txt',
    ' تصنيف الفيلم',
)

In [None]:
# Add the scraped 1950s synopses. The input is the genre stage (the step just
# before this one), so the lineage matches the other decades and does not
# depend on the earlier frame having been edited in place.
synopses_added_50s_df = add_scraped_features(
    genres_added_df_50s,
    '1950s_data/1950s_synopses.txt',
    ' ملخص',
)

## *_1960s Movies_*

In [None]:
sixties_df.sample(10)

In [None]:
describe_dataframe(sixties_df)

In [None]:
sixties_df.loc[sixties_df['اسم الفيلم'] == 'شايف خير']

In [None]:
sixties_df[' تصنيف الفيلم'].value_counts()

In [None]:
sixties_df.loc[sixties_df[' تصنيف الفيلم'] == 'ﻭﺛﺎﺋﻘﻲ']

In [None]:
sixties_df = change_empty_entries_to_NaN(sixties_df, ' تصنيف الفيلم')
sixties_df[' ملخص'] = sixties_df[' ملخص'].astype(str)

_**Cleansing**_

In [None]:
# Full-row de-duplication is currently disabled:
# sixties_df.drop_duplicates(keep = 'first', inplace = True)
# Remove, in place, every row whose genre equals the label below.
sixties_df.drop(
    index=sixties_df.index[sixties_df[' تصنيف الفيلم'] == 'ﺳﻴﺮﺓ ﺫاﺗﻴﺔ'],
    inplace=True,
)

In [None]:
sixties_df.loc[sixties_df['اسم الفيلم'] == 'الطريق']

In [None]:
sixties_df.dtypes

In [None]:
sixties_df.to_csv('1960s_data/1960s_no_duplicates.csv')

In [None]:
date_expanded_60s_df = expand_release_date(sixties_df)
date_expanded_60s_df = unify_date_format(date_expanded_60s_df)

In [None]:
date_expanded_60s_df = date_expanded_60s_df.drop(columns=[' تاريخ العرض'])

In [None]:
date_expanded_60s_df['التقييم'] = np.nan

In [None]:
# Save the date-expanded 1960s frame. The frame built by the cells above is
# named date_expanded_60s_df; the previous name was never defined.
date_expanded_60s_df.to_csv('1960s_data/date_expanded_1960s.csv')

_**Filter Foreign Movies and Add Web-Scraped Ratings, Genres, and Synopses**_

In [None]:
# Drop the 1960s movies that the scraped countries file marks as foreign.
foreigns_filtered_df_60s = filter_foregin_movies(
    date_expanded_60s_df,
    '1960s_data/1960s_countries.txt',
)

In [None]:
# Fill the rating column from the scraped 1960s ratings. The input is the
# foreign-filtered stage (the step just before this one), so the lineage is
# explicit and does not depend on the earlier frame having been edited in
# place.
ratings_added_60s_df = add_scraped_features(
    foreigns_filtered_df_60s,
    '1960s_data/1960s_ratings.txt',
    'التقييم',
)

In [None]:
ratings_added_60s_df.to_csv('1960s_data/ratings_added_df_60s.csv')

In [None]:
# Overwrite the genre column with the scraped 1960s genres.
genre_added_60s_df = add_scraped_features(
    ratings_added_60s_df,
    '1960s_data/1960s_genres.txt',
    ' تصنيف الفيلم',
)

In [None]:
genre_added_60s_df.to_csv('1960s_data/genre_added_60s_df.csv')

In [None]:
sixties_df.loc[sixties_df['اسم الفيلم'] == 'ورود ملونة']

In [None]:
# Add the scraped 1960s synopses.
synopses_added_60s_df = add_scraped_features(
    genre_added_60s_df,
    '1960s_data/1960s_synopses.txt',
    ' ملخص',
)

## *_1970s Movies_*

_**EDA**_

In [None]:
describe_dataframe(seventies_df)

In [None]:
seventies_df[' ملخص'].isna().sum()

In [None]:
seventies_df[' تصنيف الفيلم'].value_counts()

_**Cleansing**_

In [None]:
seventies_df = change_empty_entries_to_NaN(seventies_df, ' تصنيف الفيلم')
seventies_df[' ملخص'] = seventies_df[' ملخص'].astype(str)

In [None]:
seventies_df[seventies_df.duplicated(subset = ['اسم الفيلم'])]

In [None]:
# NOTE: a notebook cell displays only its last expression, so the result of
# this duplicated() lookup is discarded.
seventies_df[seventies_df.duplicated(subset = ['اسم الفيلم', ' تمثيل'])]
# Keep the last row of every group sharing the same title and cast; this
# modifies seventies_df in place.
seventies_df.drop_duplicates(subset = ['اسم الفيلم', ' تمثيل'], inplace= True, keep = 'last')

In [None]:
# Remove the hand-picked rows (addressed by index label) in a single call.
seventies_df.drop(index=[306, 156, 256, 389, 259, 433, 2], inplace=True)
# Remove, in place, every row whose genre equals the label below.
seventies_df.drop(
    index=seventies_df.index[seventies_df[' تصنيف الفيلم'] == 'ﺳﻴﺮﺓ ﺫاﺗﻴﺔ'],
    inplace=True,
)


In [None]:
seventies_df[seventies_df.duplicated(subset = ['اسم الفيلم'])]

In [None]:
seventies_df.loc[seventies_df[' تصنيف الفيلم'] == 'ﺧﻴﺎﻝ ﻋﻠﻤﻲ']

In [None]:
seventies_df.loc[seventies_df['اسم الفيلم'] == 'سيدتي الجميلة']

In [None]:
date_expanded_70s_df = expand_release_date(seventies_df)
date_expanded_70s_df = unify_date_format(date_expanded_70s_df)

In [None]:
date_expanded_70s_df = date_expanded_70s_df.drop(columns=[' تاريخ العرض'])
date_expanded_70s_df['التقييم'] = np.nan

In [None]:
date_expanded_70s_df.to_csv('1970s_data/date_expanded_1970s.csv')

In [None]:
# Drop the 1970s movies that the scraped countries file marks as foreign.
foreign_filtered_df_70s = filter_foregin_movies(
    date_expanded_70s_df,
    '1970s_data/1970s_countries.txt',
)

In [None]:
foreign_filtered_df_70s.to_csv('1970s_data/foreign_movies_filtered.csv')

In [None]:
# Fill the rating column from the scraped 1970s ratings.
ratings_added_1970s_df = add_scraped_features(
    foreign_filtered_df_70s,
    '1970s_data/1970s_ratings.txt',
    'التقييم',
)

In [None]:
ratings_added_1970s_df.to_csv('1970s_data/ratings_added_70s_df.csv')

In [None]:
# Overwrite the genre column with the scraped 1970s genres.
genre_added_df_70s = add_scraped_features(
    ratings_added_1970s_df,
    '1970s_data/1970s_genres.txt',
    ' تصنيف الفيلم',
)

In [None]:
# Add the scraped 1970s synopses.
synopses_added_70s_df = add_scraped_features(
    genre_added_df_70s,
    '1970s_data/1970s_synopses.txt',
    ' ملخص',
)

## So far, the 1970s movies have their genres and ratings, and foreign movies have been filtered out.

In [None]:
# Save the foreign-filtered 1970s frame. The frame built above is named
# foreign_filtered_df_70s; the previous spelling was never defined.
foreign_filtered_df_70s.to_csv('1970s_data/foreigns_filtered_df_70s.csv')

In [None]:
date_expanded_70s_df

In [None]:
date_expanded_70s_df.shape[0]