# Data Cleaning and Feature Engineering (Part 3)
### Merge all features into one dataframe

In [1]:
import pandas as pd

### Load all three features

In [2]:
# Load the three feature tables from CSV files. The files are referenced by
# bare relative names (resolved against the current working directory) instead
# of the Windows-only ".\" prefix, so the notebook also runs on Linux and macOS.
# read_csv already uses a comma as its default separator.
movie_df = pd.read_csv("features_movie_info.csv")
sentiment_df = pd.read_csv("features_reviews_5000_sentiment_score.csv")
box_df = pd.read_csv("features_box_office.csv")

# Display the (rows, columns) shape of each table as a quick sanity check.
movie_df.shape, sentiment_df.shape, box_df.shape

((4878, 31), (4854, 7), (4137, 7))

In [3]:
# Preview the first rows of the sentiment features to inspect the loaded columns.
sentiment_df.head()

Unnamed: 0,url_id,ss_mean,ss_median,ss_p25,ss_p75,ss_std,ss_count
0,https://www.rottentomatoes.com/m/10002519-brea...,1.0,1.0,1.0,1.0,0.0,6
1,https://www.rottentomatoes.com/m/1000_times_go...,0.818182,1.0,1.0,1.0,0.389249,55
2,https://www.rottentomatoes.com/m/10011489-bananas,0.666667,1.0,0.25,1.0,0.516398,6
3,https://www.rottentomatoes.com/m/1001_grams,0.909091,1.0,1.0,1.0,0.294245,22
4,https://www.rottentomatoes.com/m/1003757-cat_p...,0.913043,1.0,1.0,1.0,0.284885,46


### Merge all features

In [4]:
# Start from the movie info and attach the other two tables with left joins,
# so every row of movie_df is kept even when it has no match.
# The sentiment scores are matched on the movie URL, the box office data on
# the movie title; the title then becomes the index of the merged frame.
# NOTE(review): if titles are not unique in box_df, the title join would
# duplicate movie rows -- confirm against the data.
features_df = (
    movie_df
    .merge(sentiment_df, how="left", left_on="url", right_on="url_id")
    .merge(box_df, how="left", left_on="title", right_on="movie_title")
    .set_index("title")
)

In [5]:
# List the columns of the merged frame to see what each source table contributed.
features_df.columns

Index(['runtime', 'audience_score', 'tomatometer', 'tomatometer_count', 'url',
       'user_rating_count', 'genre_Action', 'genre_Adventure', 'genre_Comedy',
       'genre_Fantasy', 'genre_Horror', 'genre_Romance', 'genre_Sci-fi',
       'genre_Special Interest', 'genre_Western', 'genre_FamilyKids',
       'genre_AnimationManga', 'genre_FitnessSports', 'genre_DramaTele',
       'genre_MusicalPerfarts', 'genre_ClassicsCult', 'genre_ArthouseInter',
       'genre_ThrillMysSusp', 'genre_HistDocument', 'G', 'NC17', 'NR', 'PG',
       'PG-13', 'R', 'url_id', 'ss_mean', 'ss_median', 'ss_p25', 'ss_p75',
       'ss_std', 'ss_count', 'movie_title', 'domestic_gross',
       'domestic_opening', 'foreign_gross', 'markets', 'total_gross',
       'markets_missing'],
      dtype='object')

### Drop id related columns

In [6]:
# "url", "url_id" and "movie_title" served only as join keys for the merge,
# so remove them from the feature set.
features_df = features_df.drop(columns=["url", "url_id", "movie_title"])

In [7]:
# Expose the "tomatometer" column under the name "movie_score".
features_df = features_df.rename({"tomatometer": "movie_score"}, axis="columns")

In [8]:
# Derive one boolean target per score: True when the score is strictly above
# the median of that score column, False otherwise.
# NOTE(review): a missing score compares as False, so it is labelled the same
# as a below-median score -- confirm this is intended.
score_to_label = {
    "audience_score": "audience_score_positive",
    "movie_score": "movie_score_positive",
}
for score_col, label_col in score_to_label.items():
    scores = features_df[score_col]
    features_df[label_col] = scores.gt(scores.median())

### Columns Metadata

#### Features/Attributes/Predictors:
1. 'runtime' - Movie length in minutes
2. 'tomatometer_count' - Total number of ratings provided by movie critics in rottentomatoes.com
3. 'user_rating_count' - Total number of ratings provided by verified users in rottentomatoes.com
4. 'genre_Action', 'genre_Adventure', 'genre_Comedy', 'genre_Fantasy', 'genre_Horror', 'genre_Romance', 'genre_Sci-fi', 'genre_Special Interest', 'genre_Western', 'genre_FamilyKids', 'genre_AnimationManga', 'genre_FitnessSports', 'genre_DramaTele', 'genre_MusicalPerfarts', 'genre_ClassicsCult', 'genre_ArthouseInter', 'genre_ThrillMysSusp', 'genre_HistDocument' - The genres of the film  
5. 'G', 'NC17', 'NR', 'PG', 'PG-13', 'R' - The MPAA film rating  
6. 'ss_mean', 'ss_median', 'ss_p25', 'ss_p75', 'ss_std', 'ss_count' - Aggregate sentiment scores  
7. 'domestic_gross', 'domestic_opening', 'foreign_gross', 'markets', 'total_gross', 'markets_missing' - Box office related features  

#### Target:
1. 'audience_score': The audience rating in rottentomatoes.com  
2. 'movie_score': The tomatometer rating in rottentomatoes.com  
3. 'audience_score_positive': A binary indicator of whether the movie is good from the audience's perspective, i.e. whether its 'audience_score' is above the median audience score
4. 'movie_score_positive': A binary indicator of whether the movie is good from the critics' perspective, i.e. whether its 'movie_score' is above the median movie score

### Export to CSV

In [9]:
# Write the merged features to disk; the index (the movie title) is written
# as the first column of the file.
features_df.to_csv("training_data.csv", index=True)