In [1]:
from azureml.core import Workspace, Dataset
import azureml.dataprep as dprep
import json

# Load the workspace settings (for example "subscriptionId") from the local
# config file. The context manager closes the file handle as soon as the JSON
# has been parsed instead of leaving it open until garbage collection.
with open("config.json") as config_file:
    config = json.load(config_file)

In [20]:
# Look up the existing workspace by name inside the subscription read from config.
ws = Workspace.get(
    name="demoworkspace",
    subscription_id=config["subscriptionId"],
)

## Auto read files

`auto_read_files` works out how to parse the file on its own, so there is no need to specify separators.

In [21]:
# Create the dataset from the local CSV file; no separator is passed to the reader.
titanic_ds = Dataset.auto_read_files(path="./titanic.csv")

## View sample of data

You can use the `head` method, similar to pandas.

In [24]:
# Preview the first 5 records of the dataset.
titanic_ds.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Get random sample of data

In [29]:
# Draw a simple random sample in which each record is kept with probability 0.3,
# then preview the first rows as a pandas DataFrame.
# The fixed seed makes the sample, and therefore the preview, repeatable when
# the notebook is re-run; without it every run shows different rows.
(
    titanic_ds
    .sample('simple_random', {'probability': 0.3, 'seed': 42})
    .to_pandas_dataframe()
    .head()
)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
1,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S
2,21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S
3,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S
4,26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female,38.0,1,5,347077,31.3875,,S


## Get descriptive statistics

The `get_profile` method is similar to pandas' `describe` method, but you get more information.

In [6]:
# Compute per-column summary statistics: type, min/max, missing counts,
# estimated quantiles, mean, standard deviation, variance, skewness and kurtosis.
titanic_ds.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent Missing,Error Count,Empty Count,Unique Values,0.1% Quantile (est.),1% Quantile (est.),5% Quantile (est.),25% Quantile (est.),50% Quantile (est.),75% Quantile (est.),95% Quantile (est.),99% Quantile (est.),99.9% Quantile (est.),Mean,Standard Deviation,Variance,Skewness,Kurtosis
PassengerId,FieldType.INTEGER,1,891,891.0,0.0,891.0,0.0,0.0,0.0,891,1.391,89.6,87.5,223.25,446.0,668.75,846.95,882.59,890.609,446.0,257.354,66231.0,0.0,-1.20404
Survived,FieldType.INTEGER,0,1,891.0,0.0,891.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.383838,0.486592,0.236772,0.476913,-1.77454
Pclass,FieldType.INTEGER,1,3,891.0,0.0,891.0,0.0,0.0,0.0,3,1.0,1.0,1.0,2.0,3.0,3.0,3.0,3.0,3.0,2.30864,0.836071,0.699015,-0.628426,-1.28343
Name,FieldType.STRING,"Abbing, Mr. Anthony","van Melkebeke, Mr. Philemon",891.0,0.0,891.0,0.0,0.0,0.0,891,,,,,,,,,,,,,,
Sex,FieldType.STRING,female,male,891.0,0.0,891.0,0.0,0.0,0.0,2,,,,,,,,,,,,,,
Age,FieldType.DECIMAL,0.42,80,891.0,177.0,714.0,0.198653,0.0,0.0,89,0.4735,13.5111,12.6667,20.28,28.0,38.2,56.3067,67.44,78.716,29.6991,14.5265,211.019,0.387474,0.159767
SibSp,FieldType.INTEGER,0,8,891.0,0.0,891.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,1.0,2.98333,5.0,8.0,0.523008,1.10274,1.21604,3.68292,17.7269
Parch,FieldType.INTEGER,0,6,891.0,0.0,891.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,5.609,0.381594,0.806057,0.649728,2.73987,9.68808
Ticket,FieldType.STRING,110152,WE/P 5735,891.0,0.0,891.0,0.0,0.0,0.0,681,,,,,,,,,,,,,,
Fare,FieldType.DECIMAL,0,512.329,891.0,0.0,891.0,0.0,0.0,0.0,248,0.0,7.54406,7.5156,7.90776,14.456,31.066,112.451,255.394,512.329,32.2042,49.6934,2469.44,4.77121,33.1231


# Allow versions and dataset management

In [30]:
# Fetch the dataset's definition; the transformation steps in the following
# cells (column selection, value replacement, imputation) are applied to it.
titanic_def = titanic_ds.get_definition()

## Keep columns

Alternatively, you can use the `drop_columns` method to remove the columns you do not need.

In [8]:
# Narrow the definition down to the five columns used in the rest of the notebook.
titanic_def = titanic_def.keep_columns(
    ["Survived", "Pclass", "Sex", "Age", "SibSp"]
)

In [9]:
# Preview the definition after column selection (called without a count, the
# output below shows 10 rows).
titanic_def.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp
0,0,3,male,22.0,1
1,1,1,female,38.0,1
2,1,3,female,26.0,0
3,1,1,female,35.0,1
4,0,3,male,35.0,0
5,0,3,male,,0
6,0,1,male,54.0,0
7,0,3,male,2.0,3
8,1,3,female,27.0,0
9,1,2,female,14.0,1


## Replace values

In [10]:
# Recode the "Sex" column: "male" becomes 0 and "female" becomes 1.
titanic_def = (
    titanic_def
    .replace("Sex", "male", 0)
    .replace("Sex", "female", 1)
)

titanic_def.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp
0,0,3,0,22.0,1
1,1,1,1,38.0,1
2,1,3,1,26.0,0
3,1,1,1,35.0,1
4,0,3,0,35.0,0
5,0,3,0,,0
6,0,1,0,54.0,0
7,0,3,0,2.0,3
8,1,3,1,27.0,0
9,1,2,1,14.0,1


## Fill missing data with an imputation function

In [12]:
# Describe the imputation: missing values in "Age" are filled using the MEAN
# replacement function.
age_mean = dprep.ImputeColumnArguments(
    column_id="Age",
    impute_function=dprep.ReplaceValueFunction.MEAN,
)

# Create the builder that applies that imputation to the dataset definition.
mean_builder = titanic_def.builders.impute_missing_values(
    impute_columns=[age_mean],
)

In [13]:
# Run the builder's learn step before converting it to a dataflow.
# NOTE(review): presumably this is where the column mean is computed from the
# data — confirm against the azureml.dataprep documentation.
mean_builder.learn()

# Turn the configured builder into a dataflow; the result is kept under a new
# name so the un-imputed definition stays available.
new_titanic_def = mean_builder.to_dataflow()

# Preview the result: the Age that was empty in row 5 now shows 29.699118.
new_titanic_def.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp
0,0,3,0,22.0,1
1,1,1,1,38.0,1
2,1,3,1,26.0,0
3,1,1,1,35.0,1
4,0,3,0,35.0,0
5,0,3,0,29.699118,0
6,0,1,0,54.0,0
7,0,3,0,2.0,3
8,1,3,1,27.0,0
9,1,2,1,14.0,1
