In [0]:
%pyspark

from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Read the Habr CSV with a header row and inferred column types, force the
# "rating" column to an integer type, and cache the result for the
# inspection calls below.
habrData = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/user/admin/habr_data.csv")
    .withColumn("rating", col("rating").cast(IntegerType()))
    .cache()
)

# Show the resulting schema, then the first rows.
habrData.printSchema()
habrData.show()

In [1]:
// Scala-side load of the same CSV: header row, inferred column types, cached.
val habrData = spark.read.
  option("header", true).
  option("inferSchema", "true").
  csv("/user/admin/habr_data.csv").
  cache()

// Show the inferred schema, then the first rows.
habrData.printSchema()
habrData.show()

In [2]:
// Render the DataFrame through the notebook's `z` display helper.
z.show(habrData)

In [3]:
// Show the "rating" column on its own, sorted ascending.
habrData.
  select("rating").
  orderBy("rating").
  show()

In [4]:
%pyspark

# Read the raw CSV (header row, no schema inference) and split it 80/20
# with a fixed seed.
habrRaw = spark.read.option("header", True).csv("/user/admin/habr_data.csv")
trainDF, testDF = habrRaw.randomSplit([0.8, 0.2], seed=42)

# Persist each split as a table, overwriting any previous contents.
for splitDF, tableName in ((trainDF, "habr.train"), (testDF, "habr.test")):
    splitDF.coalesce(2).write.mode("overwrite").saveAsTable(tableName)

# Report the split sizes (counted from the DataFrames, not from the saved tables).
trainCount = trainDF.count()
testCount = testDF.count()
print("There are " + str(trainCount) + " rows in the training set, and " + str(testCount) + " in the test set")

In [5]:
%pyspark

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import Tokenizer, RegexTokenizer, HashingTF, IDF
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf

# Load the persisted train/test splits. Only the article title (the single
# input feature) and the rating (the regression target, cast to a long) are
# kept; rows with a null in either column are dropped.
train = (
    spark.table("habr.train")
    .selectExpr("title", "cast(rating as Long) rating")
    .na.drop("any")
)

test = (
    spark.table("habr.test")
    .selectExpr("title", "cast(rating as Long) rating")
    .na.drop("any")
)

# Default tokenizer. It is not a stage of the pipeline below; it is kept so a
# later paragraph can compare its output with the regex tokenizer.
tokenizer = Tokenizer(inputCol="title", outputCol="title_words")

# Split on every character outside a-z / а-я / ё (gaps=True means the pattern
# matches the separators) and discard tokens shorter than 3 characters.
# Alternatively: pattern="\\w+" with gaps=False.
regexTokenizer = RegexTokenizer(
    inputCol="title",
    outputCol="title_words",
    pattern="[^a-zа-яё]",
    gaps=True,
    minTokenLength=3,
)

# --- Step-by-step featurisation -------------------------------------------
# The intermediate DataFrames are kept as separate names because later
# paragraphs inspect them (regexTokenized, featurizedData, rescaledData).
regexTokenized = regexTokenizer.transform(train)

# Term frequencies via the hashing trick.
# Alternatively, CountVectorizer can also be used to get term frequency vectors.
hashingTF = HashingTF(inputCol="title_words", outputCol="rawFeatures", numFeatures=200000)
featurizedData = hashingTF.transform(regexTokenized)

# Rescale the raw term frequencies by inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# --- End-to-end pipeline ---------------------------------------------------
lr = LinearRegression(
    maxIter=10,
    regParam=0.1,
    featuresCol='features',
    labelCol='rating',
    predictionCol='prediction',
)

# The pipeline re-runs the same tokenise -> hash -> IDF steps itself, so it is
# fitted on the raw `train` frame rather than on `rescaledData`.
pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, lr])

model = pipeline.fit(train)
prediction = model.transform(test)

from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out predictions with RMSE against the true rating.
regressionEvaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse")

rmse = regressionEvaluator.evaluate(prediction)
print("RMSE is " + str(rmse))


In [6]:
%pyspark

# Preview the default tokenizer's output on the training titles: 20 rows, untruncated.
tokenizer.transform(train).show(20, truncate=False)

In [7]:
%pyspark
# Preview a throwaway regex tokenizer (letters only, tokens of 3+ characters)
# on the training titles, without truncating the cells.
(
    RegexTokenizer(
        inputCol="title",
        outputCol="title_words",
        pattern="[^a-zа-яё]",
        gaps=True,
        minTokenLength=3,
    )
    .transform(train)
    .show(truncate=False)
)

In [8]:
%pyspark

# Show tokens next to their hashed term-frequency vectors: 20 rows, untruncated.
(
    featurizedData
    .select("title_words", "rawFeatures")
    .show(20, truncate=False)
)

In [9]:
%pyspark

# `idf` is the (unfitted) IDF estimator, not a DataFrame, so it has no
# show() method. Print its parameters with their documentation and current
# values instead. The fitted output is displayed via `rescaledData`.
print(idf.explainParams())

In [10]:
%pyspark
# Show the IDF-rescaled feature vectors: 10 rows, untruncated.
(
    rescaledData
    .select("features")
    .show(10, truncate=False)
)

In [11]:
%pyspark

# Re-hash the tokenised titles into a 100000-dimensional space.
# NOTE: this rebinds `hashingTF` and `featurizedData`, replacing the
# 200000-feature versions created in an earlier paragraph.
hashingTF = HashingTF(
    inputCol="title_words",
    outputCol="rawFeatures",
    numFeatures=100000,
)
featurizedData = hashingTF.transform(regexTokenized)

# Show each title next to its hashed vector: 100 rows, untruncated.
(
    featurizedData
    .select("title", "rawFeatures")
    .show(100, truncate=False)
)

In [12]:
// Persist a one-row table whose title repeats the same word three times;
// the next paragraph reads it back as `habr.df`.
Seq("Reverse Reverse Reverse").
  toDF("title").
  write.
  mode("overwrite").
  saveAsTable("habr.df")

In [13]:
%pyspark

# Read back the one-row demo table and display it.
df = spark.table("habr.df")
df.show()

# Tokenise the title, hash the tokens, and show the single resulting row untruncated.
dfTokenized = regexTokenizer.transform(df)
hashingTF.transform(dfTokenized).show(1, truncate=False)

In [14]:
%pyspark

# Rebuild the regex tokenizer: split on anything that is not a-z / а-я / ё
# and keep only tokens of at least 3 characters.
regexTokenizer = RegexTokenizer(
    inputCol="title",
    outputCol="title_words",
    pattern="[^a-zа-яё]",
    gaps=True,
    minTokenLength=3,
)

# Show its output on the training titles: 100 rows, untruncated.
regexTokenizer.transform(train).show(100, truncate=False)

In [17]:
%pyspark

from pyspark.ml.feature import HashingTF
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

# Tune the title -> rating pipeline with 3-fold cross-validation.
#
# The grid params are resolved from the stages that `pipeline` actually
# holds. The notebook-level names `hashingTF` / `lr` can be rebound by other
# paragraphs to new instances; a grid entry keyed on a param of a different
# instance does not belong to any stage of `pipeline`, so it would not be
# applied during the search.
pipelineStages = pipeline.getStages()
pipelineHashingTF = next(stage for stage in pipelineStages if isinstance(stage, HashingTF))
pipelineLR = next(stage for stage in pipelineStages if isinstance(stage, LinearRegression))

# 2 x 2 grid: regularisation strength x hashed feature-space size.
paramGrid = (
    ParamGridBuilder()
    .addGrid(pipelineLR.regParam, [0.01, 0.1])
    .addGrid(pipelineHashingTF.numFeatures, [1000, 2000])
    .build()
)

# `regressionEvaluator` scores RMSE against the "rating" column, which is the
# label column the pipeline's regression stage is configured with, so no
# extra "label" column has to be added to the training data.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=regressionEvaluator,
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

# Evaluate the selected model on the held-out test set.
prediction = cvModel.transform(test)

rmse = regressionEvaluator.evaluate(prediction)
print("RMSE is " + str(rmse))


In [18]:
%pyspark

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.sql.functions import col

# Build the modelling frame from the already TF-IDF-featurised rows, copying
# "rating" into a "label" column, then hold out 10% with a fixed seed.
# NOTE: this rebinds `train`, `test`, `lr`, `paramGrid` and `model`, which
# earlier paragraphs also assign.
data = rescaledData.withColumn("label", col("rating"))
train, test = data.randomSplit([0.9, 0.1], seed=12345)

lr = LinearRegression(maxIter=10)

# Candidate settings: TrainValidationSplit evaluates every combination in the
# grid and keeps the one the evaluator scores best.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 1.0])
    .build()
)

# Here the estimator is the bare linear regression (no pipeline). Inside the
# search, 80% of `train` is used for fitting and 20% for validation.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    trainRatio=0.8,
)

# Fit every candidate and select the best one.
model = tvs.fit(train)

# Predict on the held-out rows with the winning model and show
# features, true label and prediction side by side.
(
    model.bestModel
    .transform(test)
    .select("features", "label", "prediction")
    .show()
)

In [19]:
%pyspark

# Coefficients of the best model found by the search stored in `model`.
# `model` must be the TrainValidationSplit result here: the pipeline model
# that an earlier paragraph binds to the same name is produced by
# Pipeline.fit, not by a parameter search.
model.bestModel.coefficients

* Построить распределение статей в датасете по rating с bin_size = 10.
* Написать функцию ratingToClass(rating: Int): String, которая определяет категорию статьи (A, B, C, D) на основе рейтинга. Границы классов подобрать самостоятельно.
* Добавить к датасету категориальную фичу "rating_class". При добавлении колонки использовать udf, построенную на функции из предыдущего пункта.
* Построить модель логистической регрессии (one-vs-all) для классификации статей по рассчитанным классам.
* Получить F1 score для получившейся модели.