In [1]:
# Demonstrating Google AutoML by training a sentiment-analysis model on student course reviews from Coursera.
# Dataset retrieved from Kaggle: https://www.kaggle.com/septa97/100k-courseras-course-reviews-dataset

In [2]:
import pandas as pd

In [3]:
# Relative path to the course-reviews CSV that is loaded with pandas below.
dataset_file = "100k-courseras-course-reviews-dataset/reviews_by_course.csv"

In [4]:
# Load the raw reviews CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=dataset_file)

In [5]:
# Summary statistics of the rating column.
print(data["Label"].describe())

count    140320.000000
mean          4.619185
std           0.821347
min           1.000000
25%           5.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: Label, dtype: float64


In [6]:
# Preview the first five rows of the loaded reviews.
print(data.head(n=5))

     CourseId                                             Review  Label
0  2-speed-it                                             BOring      1
1  2-speed-it                                            Bravo !      5
2  2-speed-it                                           Very goo      5
3  2-speed-it  Great course - I recommend it for all, especia...      5
4  2-speed-it    One of the most useful course on IT Management!      5


In [7]:
# Separate the remaining columns (X) from the rating column (Y).
X = data.drop(columns=['Label'])
Y = data['Label']

In [8]:
from google.cloud import automl_v1beta1 as automl

In [9]:
# Google Cloud project settings used by the AutoML client calls below.
project_id = 'automltarek'          # project passed to location_path / dataset_path
compute_region = 'us-central1'      # region passed alongside project_id
dataset_name = 'coursera_reviews'   # used as the dataset's display_name
# When True the dataset is created with classification type MULTILABEL,
# otherwise MULTICLASS.
# NOTE(review): the Label column shown above holds a single 1-5 rating per
# review; confirm that MULTILABEL (rather than MULTICLASS) is intended.
multilabel = True

In [10]:
from google.cloud import automl_v1beta1 as automl

# Client object used for every AutoML API call in this notebook.
client = automl.AutoMlClient()

# Resource path identifying the project/region pair that will hold the dataset.
project_location = client.location_path(project_id, compute_region)

# MULTILABEL when `multilabel` is truthy, MULTICLASS otherwise.
classification_type = "MULTILABEL" if multilabel else "MULTICLASS"

# Text-classification metadata, then the dataset description sent to the API.
dataset_metadata = {"classification_type": classification_type}
my_dataset = {
    "display_name": dataset_name,
    "text_classification_dataset_metadata": dataset_metadata,
}

# Ask the service to create the dataset in the chosen location.
dataset = client.create_dataset(project_location, my_dataset)

# Report what was created, one field per line.
print(
    "\n".join(
        [
            "Dataset name: {}".format(dataset.name),
            "Dataset id: {}".format(dataset.name.split("/")[-1]),
            "Dataset display name: {}".format(dataset.display_name),
            "Text classification dataset metadata:",
            "\t{}".format(dataset.text_classification_dataset_metadata),
            "Dataset example count: {}".format(dataset.example_count),
            "Dataset create time:",
            "\tseconds: {}".format(dataset.create_time.seconds),
            "\tnanos: {}".format(dataset.create_time.nanos),
        ]
    )
)

Dataset name: projects/594223496258/locations/us-central1/datasets/TCN4741386458792227252
Dataset id: TCN4741386458792227252
Dataset display name: coursera_reviews
Text classification dataset metadata:
	classification_type: MULTILABEL

Dataset example count: 0
Dataset create time:
	seconds: 1544556511
	nanos: 631486000


In [None]:
# Import the review data from Cloud Storage into the AutoML dataset.
# `path` may hold one URI or several comma-separated gs:// URIs.
path = 'gs://automltarek-lcm/coursera_reviews/reviews_course_mod.csv'

# Take the ID from the dataset object created earlier in this notebook rather
# than a hard-coded value, so a fresh "Restart & Run All" imports into the
# dataset that run actually created (the ID is the last segment of its name).
dataset_id = dataset.name.split("/")[-1]

# Get the full resource path of the dataset.
dataset_full_id = client.dataset_path(
    project_id, compute_region, dataset_id
)

# Split the comma-separated URIs, tolerating surrounding whitespace and
# ignoring empty entries (e.g. from a trailing comma).
input_uris = [uri.strip() for uri in path.split(",") if uri.strip()]
input_config = {"gcs_source": {"input_uris": input_uris}}

# Start importing the data from the input URI(s) into the dataset.
response = client.import_data(dataset_full_id, input_config)

print("Processing import...")
# Synchronous check of operation status: result() is called on the returned
# operation before the completion message is printed.
print("Data imported. {}".format(response.result()))