# Import Pandas

In [1]:
import pandas as pd

# Importing Data

In [6]:
# Column labels to apply to the tab-separated file; because explicit names are
# supplied, pandas reads the file without looking for a header row.
names = ['id', 'title', 'year', 'rating', 'votes', 'length', 'genres']
data = pd.read_csv(
    'imdb_top_10000.txt',
    sep='\t',
    names=names,
    index_col=0,  # the first column ('id') becomes the row index
)

# Exploring our Data

In [None]:
# Preview the first rows of the frame (head() shows 5 rows by default).
data.head()

In [None]:
# Preview only the first 3 rows.
data.head(3)

In [None]:
# Preview the last rows of the frame (tail() shows 5 rows by default).
data.tail()

In [None]:
# Print a concise summary: index, column names, non-null counts and dtypes.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

# Exporting Data

In [13]:
# Write the frame to a comma-separated file, keeping both the header row and
# the index column in the output.
data.to_csv(
    'test.csv',
    sep=',',
    header=True,
    index=True,
)

# Sorting Data

In [None]:
# Sort rows by rating, lowest first (ascending is the default).
# sort_values returns a new frame; `data` itself is not reordered.
data.sort_values(by='rating')

In [None]:
# Sort rows by rating, highest first.
data.sort_values(by='rating', ascending=False)

# Creating Data Frames from Scratch

In [16]:
# Small hand-written dataset: each key maps to a list of three values.
sample_data = dict(
    tv=[230, 44, 17],
    radio=[37, 39, 45],
    news=[69, 45, 69],
    sales=[22, 10, 9],
)

In [17]:
# Build a DataFrame from the dict: keys become column labels, lists become the column values.
data2 = pd.DataFrame(data=sample_data)

In [None]:
# Display the frame that was just built from the dict.
data2

In [19]:
# Remove the name `data2` from the kernel namespace.
del data2

In [None]:
# `data2` no longer exists once it has been deleted, so referencing it raises
# NameError. Catch and print the error so the demonstration stays visible
# without aborting a Restart & Run All.
try:
    data2
except NameError as err:
    print(f"NameError: {err}")

# Selecting Data

In [None]:
# Select a single column by label; the result is a Series.
data['title']

In [None]:
# Select several columns by passing a list of labels; the result is a DataFrame.
data[['title', 'year']]

In [None]:
# Average of the rating column.
data['rating'].mean()

In [None]:
# Largest value in the rating column.
data['rating'].max()

In [None]:
# Smallest value in the rating column.
data['rating'].min()

In [None]:
# Distinct values of the genres column, in order of first appearance.
data['genres'].unique()

In [None]:
# Number of rows per distinct rating, most frequent rating first.
data['rating'].value_counts()

In [None]:
# Same counts, but ordered by the rating value itself (lowest rating first).
data['rating'].value_counts().sort_index()

In [None]:
# Same counts, ordered by the rating value with the highest rating first.
data['rating'].value_counts().sort_index(ascending=False)

# Plotting

In [33]:
%matplotlib inline

In [None]:
# Default plot: a line chart of the numeric columns against the index.
# NOTE(review): the columns share one y-axis, so those with large values may
# dwarf the others — confirm this is the intended illustration.
data.plot()

In [None]:
# Scatter plot with rating on the x-axis and votes on the y-axis.
data.plot(kind='scatter', x='rating', y='votes')

In [None]:
# Same scatter plot with semi-transparent markers (alpha=0.3) so that
# overlapping points show up as darker regions.
data.plot(kind='scatter', x='rating', y='votes', alpha=0.3)

In [None]:
# Histogram of the rating column.
data['rating'].plot(kind='hist')

In [38]:
import seaborn as sns

In [None]:
# Scatter of votes against rating with a fitted linear regression line drawn on top.
sns.lmplot(x='rating', y='votes', data=data)

In [None]:
# Grid of pairwise plots between the numeric columns of the frame.
sns.pairplot(data)

# Ordinary Least Squares (OLS) Regression

In [43]:
import statsmodels.api as sm

In [44]:
# Regress votes (dependent variable) on rating (explanatory variable).
# sm.OLS does not add an intercept on its own; without add_constant the fitted
# line would be forced through the origin, so include a constant column.
results = sm.OLS(data['votes'], sm.add_constant(data['rating'])).fit()

In [None]:
# Display the summary table of the fitted regression.
results.summary()

# Advanced Data Selection

In [None]:
# Keep only the rows whose year is greater than 1995 (boolean-mask row selection).
data.loc[data['year'] > 1995]

In [None]:
# The mask on its own: a boolean Series with one True/False entry per row.
data['year'].gt(1995)

In [None]:
# Rows whose year is exactly 1966.
data.loc[data['year'] == 1966]

In [None]:
# Combine two masks with element-wise AND: year strictly between 1995 and 2000.
data.loc[data['year'].gt(1995) & data['year'].lt(2000)]

In [None]:
# Combine two masks with element-wise OR: years outside the 1996-1999 window.
# Bounds such as (year > 1995) | (year < 2000) would be true for every year and
# return the whole frame, so the two conditions must not overlap.
data[(data['year'] <= 1995) | (data['year'] >= 2000)]

In [None]:
# Ten best-rated rows among those with a year strictly between 1995 and 2000.
(
    data.loc[(data['year'] > 1995) & (data['year'] < 2000)]
    .sort_values(by='rating', ascending=False)
    .head(10)
)

# Grouping

In [None]:
# Average rating per year (grouping by the 'year' column label).
data.groupby('year')['rating'].mean()

In [None]:
# Highest rating per year.
data.groupby('year')['rating'].max()

In [None]:
# Lowest rating per year.
data.groupby('year')['rating'].min()

# Challenges

1. Which movie from 1996 has the highest rating?
2. In what year was the highest-rated movie of all time made?
3. Which five movies have the most votes ever?
4. Which year in the 1960s had the highest average movie rating?

In [None]:
# Challenge 1: rows from 1996, best rating first; the answer is the top row.
(
    data.loc[data['year'] == 1996]
    .sort_values(by='rating', ascending=False)
    .head()
)

In [None]:
# Challenge 2: every row whose rating equals the overall maximum (ties included);
# the 'year' column of the result answers the question.
data.loc[data['rating'].eq(data['rating'].max())]

In [None]:
# Challenge 3: the five rows with the most votes, largest count first.
data.sort_values(by='votes', ascending=False).head(5)

In [None]:
# Challenge 4: average rating for each year from 1960 to 1969, best year first.
# Group by the 'year' column of the filtered frame itself (not by the
# unfiltered data['year'] Series) and rank the means so the answer is row one.
(
    data[(data['year'] >= 1960) & (data['year'] < 1970)]
    .groupby('year')['rating']
    .mean()
    .sort_values(ascending=False)
)

# Cleaning Data

In [83]:
# First attempt: drop the last 7 characters of every title.
# NOTE(review): presumably this strips a trailing " (YYYY)" suffix — confirm
# that every title ends that way. This adds a column to `data` in place.
data['formatted title'] = data['title'].str[:-7]

In [None]:
# Preview the frame to inspect the 'formatted title' column.
data.head()

In [90]:
# Keep only the text before the first " (" in each title.
# NOTE(review): assumes titles look like "Name (year)"; a title that itself
# contains " (" would be cut at that earlier point — confirm against the data.
# The pattern is a raw string because "\(" in a normal string literal is an
# invalid escape sequence, which recent Python versions warn about.
data['formatted title'] = data['title'].str.split(r' \(').str[0]

In [None]:
# Preview the frame to inspect the recomputed 'formatted title' column.
data.head()

In [98]:
# Strip the literal " mins." suffix from each length and convert the rest to int.
# regex=False makes the replacement a plain-text match on every pandas version;
# interpreted as a regular expression, the "." would match any character.
data['formatted length'] = (
    data['length']
    .str.replace(' mins.', '', regex=False)
    .astype('int')
)

In [None]:
# Preview the frame to inspect the 'formatted length' column.
data.head()

In [None]:
# Re-draw the pair plot; `data` now also carries the integer 'formatted length' column.
sns.pairplot(data)

In [None]:
# Inspect the rows whose parsed length came out as 0.
data.loc[data['formatted length'] == 0]